From 454285c7014001e34991657205e8d805769d8758 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:01:49 +0800 Subject: [PATCH 01/16] [T1-1-1]: register new operators in framework --- include/infiniop.h | 9 + scripts/python_test.py | 9 + src/infiniop-test/include/ops.hpp | 18 ++ test/infiniop/libinfiniop/op_register.py | 287 +++++++++++++++++++++++ 4 files changed, 323 insertions(+) diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..c86127cb2 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -16,6 +16,15 @@ #include "infiniop/ops/rope.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/exp.h" +#include "infiniop/ops/sin.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/leakyrelu.h" +#include "infiniop/ops/tanh.h" +#include "infiniop/ops/sigmoid_backward.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/cast.h" +#include "infiniop/ops/where.h" #include "infiniop/tensor_descriptor.h" #endif // __INFINIOP_API_H__ diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..02fd65c63 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,15 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "exp.py", + "sin.py", + "cos.py", + "leakyrelu.py", + "tanh.py", + "sigmoid_backward.py", + "hardswish.py", + "cast.py", + "where.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..4c16eeec7 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,15 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(exp) +DECLARE_INFINIOP_TEST(sin) +DECLARE_INFINIOP_TEST(cos) +DECLARE_INFINIOP_TEST(leakyrelu) +DECLARE_INFINIOP_TEST(tanh) +DECLARE_INFINIOP_TEST(sigmoid_backward) +DECLARE_INFINIOP_TEST(hardswish) +DECLARE_INFINIOP_TEST(cast) +DECLARE_INFINIOP_TEST(where) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +52,15 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(exp) \ + REGISTER_INFINIOP_TEST(sin) \ + REGISTER_INFINIOP_TEST(cos) \ + REGISTER_INFINIOP_TEST(leakyrelu) \ + REGISTER_INFINIOP_TEST(tanh) \ + REGISTER_INFINIOP_TEST(sigmoid_backward)\ + REGISTER_INFINIOP_TEST(hardswish) \ + REGISTER_INFINIOP_TEST(cast) \ + REGISTER_INFINIOP_TEST(where) \ } namespace infiniop_test { diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..86cee0424 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -489,3 +489,290 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + 
c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def leakyrelu_(lib): + lib.infiniopCreateLeakyreluDescriptor.restype = c_int32 + lib.infiniopCreateLeakyreluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + ] + + lib.infiniopGetLeakyreluWorkspaceSize.restype = c_int32 + lib.infiniopGetLeakyreluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLeakyrelu.restype = c_int32 + lib.infiniopLeakyrelu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLeakyreluDescriptor.restype = c_int32 + lib.infiniopDestroyLeakyreluDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def sigmoid_backward_(lib): + lib.infiniopCreateSigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSigmoidBackwardWorkspaceSize.restype 
= c_int32 + lib.infiniopGetSigmoidBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSigmoidBackward.restype = c_int32 + lib.infiniopSigmoidBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardswishDescriptor.restype = c_int32 + lib.infiniopCreateHardswishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardswishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardswishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardswish.restype = c_int32 + lib.infiniopHardswish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHardswishDescriptor.restype = c_int32 + lib.infiniopDestroyHardswishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def cast_(lib): + lib.infiniopCreateCastDescriptor.restype = c_int32 + lib.infiniopCreateCastDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCastWorkspaceSize.restype = c_int32 + lib.infiniopGetCastWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCast.restype = c_int32 + lib.infiniopCast.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCastDescriptor.restype = c_int32 + lib.infiniopDestroyCastDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] \ No newline at end of file From 381ccbc646594fb86f8b61c2aebfd973ffc3f6e9 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:06:58 +0800 Subject: [PATCH 02/16] [T1-1-1]: Exp operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/exp.h | 24 +++ src/infiniop-test/src/ops/exp.cpp | 115 ++++++++++++++ src/infiniop/ops/exp/cpu/exp_cpu.cc | 52 +++++++ src/infiniop/ops/exp/cpu/exp_cpu.h | 21 +++ src/infiniop/ops/exp/cuda/kernel.cuh | 39 +++++ src/infiniop/ops/exp/metax/exp_metax.h | 8 + src/infiniop/ops/exp/metax/exp_metax.maca | 60 ++++++++ src/infiniop/ops/exp/nvidia/exp_nvidia.cu | 59 ++++++++ src/infiniop/ops/exp/nvidia/exp_nvidia.cuh | 8 + 
 src/infiniop/ops/exp/operator.cc | 142 ++++++++++++++++++
 test/infiniop/exp.py | 165 +++++++++++++++++++++
 11 files changed, 693 insertions(+)
 create mode 100644 include/infiniop/ops/exp.h
 create mode 100644 src/infiniop-test/src/ops/exp.cpp
 create mode 100644 src/infiniop/ops/exp/cpu/exp_cpu.cc
 create mode 100644 src/infiniop/ops/exp/cpu/exp_cpu.h
 create mode 100644 src/infiniop/ops/exp/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/exp/metax/exp_metax.h
 create mode 100644 src/infiniop/ops/exp/metax/exp_metax.maca
 create mode 100644 src/infiniop/ops/exp/nvidia/exp_nvidia.cu
 create mode 100644 src/infiniop/ops/exp/nvidia/exp_nvidia.cuh
 create mode 100644 src/infiniop/ops/exp/operator.cc
 create mode 100644 test/infiniop/exp.py

diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h
new file mode 100644
index 000000000..624bc5363
--- /dev/null
+++ b/include/infiniop/ops/exp.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_EXP_API_H__
+#define __INFINIOP_EXP_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopExpDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle,
+                                                        infiniopExpDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t output,
+                                                        infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *output,
+                                        const void *input,
+                                        void *stream);
+
+__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc);
+
+#endif
diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp
new file mode 100644
index 000000000..395408e15
--- /dev/null
+++ b/src/infiniop-test/src/ops/exp.cpp
@@ -0,0 +1,115 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <iostream>
+
+namespace infiniop_test::exp {
+struct Test::Attributes {
+    std::shared_ptr<Tensor> input;
+    std::shared_ptr<Tensor> output;
+    std::shared_ptr<Tensor> ans;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+    if (tensors.find("input") == tensors.end()
+        || tensors.find("output") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Test");
+    }
+
+    test->_attributes->input = tensors["input"];
+    test->_attributes->output = tensors["output"];
+    test->_attributes->ans = tensors["ans"];
+
+    auto elemType = test->_attributes->input->ggml_type();
+    if (elemType == GGML_TYPE_BF16) {
+        test->_rtol = 1e-2;
+        test->_atol = 1e-2;
+    }
+    if (elemType == GGML_TYPE_F16) {
+        test->_rtol = 1e-3;
+        test->_atol = 1e-3;
+    }
+    if (elemType == GGML_TYPE_F32) {
+        test->_rtol = 1e-6;
+        test->_atol = 1e-6;
+    }
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    infiniopExpDescriptor_t op_desc;
+    auto input = _attributes->input->to(device, device_id);
+    auto output = _attributes->output->to(device, device_id);
+    CHECK_OR(infiniopCreateExpDescriptor(handle, &op_desc,
+                                         output->desc(),
+                                         input->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetExpWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopExp(op_desc, workspace, workspace_size,
+                         output->data(),
+                         input->data(),
+                         nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        allClose(output, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopExp(
+                op_desc, workspace, workspace_size,
+                output->data(),
+                input->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"input", "output", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"output"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- output: " << _attributes->output->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+} // namespace infiniop_test::exp
diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc
new file mode 100644
index 000000000..58a6d0f2d
--- /dev/null
+++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc
@@ -0,0 +1,52 @@
+#include "exp_cpu.h"
+
+namespace op::exp::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ExpOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ExpOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ExpOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ExpOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::exp::cpu
diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h
new file mode 100644
index 000000000..fbf9ab126
--- /dev/null
+++ b/src/infiniop/ops/exp/cpu/exp_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __EXP_CPU_H__
+#define __EXP_CPU_H__
+
+#include <cmath>
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(exp, cpu)
+
+namespace op::exp::cpu {
+typedef struct ExpOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &input) const {
+        return std::exp(input);
+    }
+} ExpOp;
+} // namespace op::exp::cpu
+
+#endif // __EXP_CPU_H__
diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh
new file mode 100644
index 000000000..316a393be
--- /dev/null
+++ b/src/infiniop/ops/exp/cuda/kernel.cuh
@@ -0,0 +1,39 @@
+#ifndef __EXP_CUDA_H__
+#define __EXP_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::exp::cuda {
+typedef struct ExpOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(input);
+            float2 vr = make_float2(__expf(vf.x), __expf(vf.y));
+            return __float22half2_rn(vr);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float inputf = __half2float(input);
+            return __float2half_rn(__expf(inputf));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(input));
+            float f1 = __bfloat162float(__high2bfloat16(input));
+            return __floats2bfloat162_rn(__expf(f0), __expf(f1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float inputf = __bfloat162float(input);
+            return __float2bfloat16_rn(__expf(inputf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __expf(input);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return std::exp(input);
+        } else {
+            return std::exp(input);
+        }
+    }
+} ExpOp;
+} // namespace op::exp::cuda
+
+#endif // __EXP_CUDA_H__
diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h
new file mode 100644
index 000000000..fb10faf9b
--- /dev/null
+++ b/src/infiniop/ops/exp/metax/exp_metax.h
@@ -0,0 +1,8 @@
+#ifndef __EXP_METAX_API_H__
+#define __EXP_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(exp, metax)
+
+#endif // __EXP_METAX_API_H__
diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca
new file mode 100644
index 000000000..c71703c6d
--- /dev/null
+++ b/src/infiniop/ops/exp/metax/exp_metax.maca
@@ -0,0 +1,60 @@
+#include "exp_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::exp::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs,
stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..f4229a942 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..56f5d29cd --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" +#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + 
reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..eb139af12 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,165 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 
5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(output, input): + output.copy_(torch.exp(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + exp(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 3e1f1dc1f6a616b29657f96b144c93bb445a4a3e Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:08:39 +0800 Subject: [PATCH 03/16] [T1-1-1]: Sin operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/sin.h | 24 +++ src/infiniop-test/src/ops/sin.cpp | 114 ++++++++++++++ src/infiniop/ops/sin/cpu/sin_cpu.cc | 52 +++++++ src/infiniop/ops/sin/cpu/sin_cpu.h | 21 +++ src/infiniop/ops/sin/cuda/kernel.cuh | 39 
+++++ src/infiniop/ops/sin/metax/sin_metax.h | 8 + src/infiniop/ops/sin/metax/sin_metax.maca | 60 ++++++++ src/infiniop/ops/sin/nvidia/sin_nvidia.cu | 59 ++++++++ src/infiniop/ops/sin/nvidia/sin_nvidia.cuh | 8 + src/infiniop/ops/sin/operator.cc | 142 ++++++++++++++++++ test/infiniop/sin.py | 166 +++++++++++++++++++++ 11 files changed, 693 insertions(+) create mode 100644 include/infiniop/ops/sin.h create mode 100644 src/infiniop-test/src/ops/sin.cpp create mode 100644 src/infiniop/ops/sin/cpu/sin_cpu.cc create mode 100644 src/infiniop/ops/sin/cpu/sin_cpu.h create mode 100644 src/infiniop/ops/sin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sin/metax/sin_metax.h create mode 100644 src/infiniop/ops/sin/metax/sin_metax.maca create mode 100644 src/infiniop/ops/sin/nvidia/sin_nvidia.cu create mode 100644 src/infiniop/ops/sin/nvidia/sin_nvidia.cuh create mode 100644 src/infiniop/ops/sin/operator.cc create mode 100644 test/infiniop/sin.py diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..640deccc0 --- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp new file mode 100644 index 000000000..db256c283 --- /dev/null +++ b/src/infiniop-test/src/ops/sin.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sin { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSinDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSinDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return 
TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetSinWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopSin(op_desc, workspace, workspace_size,
+                         output->data(),
+                         input->data(),
+                         nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        allClose(output, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopSin(
+                op_desc, workspace, workspace_size,
+                output->data(),
+                input->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"input", "output", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"output"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- output: " << _attributes->output->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+} // namespace infiniop_test::sin
diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc
new file mode 100644
index 000000000..88ba6cdd6
--- /dev/null
+++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc
@@ -0,0 +1,52 @@
+#include "sin_cpu.h"
+
+namespace op::sin::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SinOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SinOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<SinOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SinOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sin::cpu
diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h
new file mode 100644
index 000000000..e221c2573
--- /dev/null
+++ b/src/infiniop/ops/sin/cpu/sin_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __SIN_CPU_H__
+#define __SIN_CPU_H__
+
+#include <cmath>
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(sin, cpu)
+
+namespace op::sin::cpu {
+typedef struct SinOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &input) const {
+        return std::sin(input);
+    }
+} SinOp;
+} // namespace op::sin::cpu
+
+#endif // __SIN_CPU_H__
diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh
new file mode 100644
index 000000000..c9993ca12
--- /dev/null
+++ b/src/infiniop/ops/sin/cuda/kernel.cuh
@@ -0,0 +1,39 @@
+#ifndef __SIN_CUDA_H__
+#define __SIN_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::sin::cuda {
+typedef struct SinOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(input);
+            float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y));
+            return __float22half2_rn(vr);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float inputf = __half2float(input);
+            return __float2half_rn(sinf(inputf));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(input));
+            float f1 = __bfloat162float(__high2bfloat16(input));
+            return __floats2bfloat162_rn(__sinf(f0), __sinf(f1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float inputf = __bfloat162float(input);
+            return __float2bfloat16_rn(__sinf(inputf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return sinf(input);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return std::sin(input);
+        } else {
+            return std::sin(input);
+        }
+    }
+} SinOp;
+} // namespace op::sin::cuda
+
+#endif // __SIN_CUDA_H__
diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h
new file mode 100644
index 000000000..5b272d4d9
--- /dev/null
+++ b/src/infiniop/ops/sin/metax/sin_metax.h
@@ -0,0 +1,8 @@
+#ifndef __SIN_METAX_API_H__
+#define __SIN_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(sin, metax)
+
+#endif // __SIN_METAX_API_H__
diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca
new file mode 100644
index 000000000..5ea69e139
--- /dev/null
+++ b/src/infiniop/ops/sin/metax/sin_metax.maca
@@ -0,0 +1,60 @@
+#include "sin_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::sin::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs,
stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..eaac7a582 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..31f5b48ef --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_CUDA_API_H__ +#define __SIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_CUDA_API_H__ diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..38d8b242c --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + 
infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..613257e9c --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), 
+ ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(output, input): + output.copy_(torch.sin(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sin(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_sin(): + check_error( + LIBINFINIOP.infiniopSin( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 0ad8416e4ba757263aea672bce7190753f301984 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:09:56 +0800 Subject: [PATCH 04/16] [T1-1-1]: Cos operator with cpu nvidia metax iluvatar and 
test --- include/infiniop/ops/cos.h | 24 +++ src/infiniop-test/src/ops/cos.cpp | 114 ++++++++++++++ src/infiniop/ops/cos/cpu/cos_cpu.cc | 52 +++++++ src/infiniop/ops/cos/cpu/cos_cpu.h | 21 +++ src/infiniop/ops/cos/cuda/kernel.cuh | 49 ++++++ src/infiniop/ops/cos/metax/cos_metax.h | 8 + src/infiniop/ops/cos/metax/cos_metax.maca | 60 ++++++++ src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 59 ++++++++ src/infiniop/ops/cos/nvidia/cos_nvidia.cuh | 8 + src/infiniop/ops/cos/operator.cc | 142 ++++++++++++++++++ test/infiniop/cos.py | 166 +++++++++++++++++++++ 11 files changed, 703 insertions(+) create mode 100644 include/infiniop/ops/cos.h create mode 100644 src/infiniop-test/src/ops/cos.cpp create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.h create mode 100644 src/infiniop/ops/cos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cos/metax/cos_metax.h create mode 100644 src/infiniop/ops/cos/metax/cos_metax.maca create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cu create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cuh create mode 100644 src/infiniop/ops/cos/operator.cc create mode 100644 test/infiniop/cos.py diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..aeb551e77 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/cos.cpp b/src/infiniop-test/src/ops/cos.cpp new file mode 100644 index 000000000..7cae4574d --- /dev/null +++ b/src/infiniop-test/src/ops/cos.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cos { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCosDescriptor_t 
op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateCosDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCosWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCos(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCos( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::cos diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..f5d27ec49 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,52 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h 
b/src/infiniop/ops/cos/cpu/cos_cpu.h
new file mode 100644
index 000000000..37efb7597
--- /dev/null
+++ b/src/infiniop/ops/cos/cpu/cos_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __COS_CPU_H__
+#define __COS_CPU_H__
+
+#include <cmath>
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(cos, cpu)
+
+namespace op::cos::cpu {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &input) const {
+        return std::cos(input);
+    }
+} CosOp;
+} // namespace op::cos::cpu
+
+#endif // __COS_CPU_H__
diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh
new file mode 100644
index 000000000..381a897f0
--- /dev/null
+++ b/src/infiniop/ops/cos/cuda/kernel.cuh
@@ -0,0 +1,49 @@
+#ifndef __COS_CUDA_H__
+#define __COS_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cmath>
+
+namespace op::cos::cuda {
+typedef struct CosOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        auto cos_f32 = [] __device__ (float x) {
+            double xd = static_cast<double>(x);
+            double yd = std::cos(xd);
+            return static_cast<float>(yd);
+        };
+
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(input);
+            float2 vr = make_float2(
+                cos_f32(vf.x),
+                cos_f32(vf.y)
+            );
+            return __float22half2_rn(vr);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(input);
+            float yf = cos_f32(xf);
+            return __float2half_rn(yf);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(input));
+            float f1 = __bfloat162float(__high2bfloat16(input));
+            return __floats2bfloat162_rz(cos_f32(f0), cos_f32(f1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(input);
+            return __float2bfloat16_rz(cos_f32(xf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return cos_f32(input);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return std::cos(input);
+        } else {
+            return std::cos(input);
+        }
+    }
+} CosOp;
+} // namespace op::cos::cuda
+
+#endif // __COS_CUDA_H__
diff --git a/src/infiniop/ops/cos/metax/cos_metax.h b/src/infiniop/ops/cos/metax/cos_metax.h
new file mode 100644
index 000000000..a98fa3211
--- /dev/null
+++ b/src/infiniop/ops/cos/metax/cos_metax.h
@@ -0,0 +1,8 @@
+#ifndef __COS_METAX_API_H__
+#define __COS_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(cos, metax)
+
+#endif // __COS_METAX_API_H__
diff --git a/src/infiniop/ops/cos/metax/cos_metax.maca b/src/infiniop/ops/cos/metax/cos_metax.maca
new file mode 100644
index 000000000..144db47ef
--- /dev/null
+++ b/src/infiniop/ops/cos/metax/cos_metax.maca
@@ -0,0 +1,60 @@
+#include "cos_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::cos::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create Metax elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::metax diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu new file mode 100644 index 000000000..a3c38bc89 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cos_nvidia.cuh" + +namespace op::cos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh new file mode 100644 index 000000000..f6c350dd6 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COS_CUDA_API_H__ +#define __COS_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_CUDA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..11781d591 --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,142 @@ +#include 
"../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cos_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cos_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..d1d94db3a --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + 
InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos(output, input): + output.copy_(torch.cos(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + cos(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_cos(): + check_error( + LIBINFINIOP.infiniopCos( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_cos() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cos(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + 
NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From b41cbd2bd637dded8c9c18144ffd007273f22cb9 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:11:08 +0800 Subject: [PATCH 05/16] [T1-1-1]: Leakyrelu operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/leakyrelu.h | 25 +++ src/infiniop-test/src/ops/leakyrelu.cpp | 123 ++++++++++++ .../ops/leakyrelu/cpu/leakyrelu_cpu.cc | 104 ++++++++++ .../ops/leakyrelu/cpu/leakyrelu_cpu.h | 7 + src/infiniop/ops/leakyrelu/cuda/kernel.cuh | 69 +++++++ src/infiniop/ops/leakyrelu/info.h | 52 +++++ src/infiniop/ops/leakyrelu/leakyrelu.h | 49 +++++ .../ops/leakyrelu/metax/leakyrelu_metax.h | 8 + .../ops/leakyrelu/metax/leakyrelu_metax.maca | 174 +++++++++++++++++ .../ops/leakyrelu/nvidia/leakyrelu_nvidia.cu | 178 ++++++++++++++++++ .../ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh | 8 + src/infiniop/ops/leakyrelu/operator.cc | 164 ++++++++++++++++ test/infiniop/leakyrelu.py | 168 +++++++++++++++++ 13 files changed, 1129 insertions(+) create mode 100644 include/infiniop/ops/leakyrelu.h create mode 100644 src/infiniop-test/src/ops/leakyrelu.cpp create mode 100644 src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc create mode 100644 src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h create mode 100644 src/infiniop/ops/leakyrelu/cuda/kernel.cuh create mode 100644 src/infiniop/ops/leakyrelu/info.h create mode 100644 src/infiniop/ops/leakyrelu/leakyrelu.h create mode 100644 src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h create mode 100644 src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca create mode 100644 src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu create mode 100644 src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh create mode 100644 src/infiniop/ops/leakyrelu/operator.cc create mode 100644 test/infiniop/leakyrelu.py diff --git a/include/infiniop/ops/leakyrelu.h b/include/infiniop/ops/leakyrelu.h new file mode 100644 index 000000000..9ce93d53c --- /dev/null +++ b/include/infiniop/ops/leakyrelu.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_LEAKYRELU_API_H__ +#define __INFINIOP_LEAKYRELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLeakyreluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLeakyreluDescriptor(infiniopHandle_t handle, + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float negative_slope); + +__C __export infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/leakyrelu.cpp b/src/infiniop-test/src/ops/leakyrelu.cpp new file mode 100644 index 000000000..c63741120 --- /dev/null +++ b/src/infiniop-test/src/ops/leakyrelu.cpp @@ -0,0 +1,123 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::leakyrelu { +struct Test::Attributes { + float negative_slope; + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, 
+ double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (attributes.find("negative_slope") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->negative_slope = *reinterpret_cast(attributes["negative_slope"].data()); + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLeakyreluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateLeakyreluDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + _attributes->negative_slope), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetLeakyreluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + CHECK_OR(infiniopLeakyrelu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLeakyrelu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"negative_slope"}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- negative_slope=" << _attributes->negative_slope << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::leakyrelu diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc new file mode 100644 index 000000000..cd56f0ca6 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc @@ -0,0 +1,104 @@ +#include "leakyrelu_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../info.h" +#include "infinicore.h" +#include + +namespace op::leakyrelu::cpu { + +struct 
Descriptor::Opaque {}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + + *desc_ptr = new Descriptor( + info_r.take(), + 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { return _min_workspace_size; } + +template +static inline void cpu_leakyrelu_impl_incremental( + void *output, const void *input, const op::leakyrelu::LeakyReLUInfo &info) { + + const size_t ndim = info.shape.size(); + const size_t n = info.n; + + if (n == 0) return; + + auto out_base = reinterpret_cast(output); + auto in_base = reinterpret_cast(input); + + const std::vector &shape = info.shape; + const std::vector &in_stride = info.in_stride; + const std::vector &out_stride = info.out_stride; + + std::vector idx(ndim, 0); + ptrdiff_t in_off = 0; + ptrdiff_t out_off = 0; + + for (size_t it = 0; it < n; ++it) { + const T *in_elem = in_base + in_off; + T *out_elem = out_base + out_off; + + float v = utils::cast(*in_elem); + float outv = v >= 0.0f ? v : v * info.negative_slope; + *out_elem = utils::cast(outv); + for (int d = static_cast(ndim) - 1; d >= 0; --d) { + idx[d] += 1; + if (in_stride[d] != 0) in_off += in_stride[d]; + if (out_stride[d] != 0) out_off += out_stride[d]; + + if (idx[d] < shape[d]) { + break; + } else { + idx[d] = 0; + if (in_stride[d] != 0) in_off -= static_cast(shape[d]) * in_stride[d]; + if (out_stride[d] != 0) out_off -= static_cast(shape[d]) * out_stride[d]; + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_BF16: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_F32: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_F64: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::leakyrelu::cpu diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h new file mode 100644 index 000000000..e58ca1409 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h @@ -0,0 +1,7 @@ +#ifndef __LEAKYRELU_CPU_H__ +#define __LEAKYRELU_CPU_H__ +#include "../leakyrelu.h" + +DESCRIPTOR(cpu) + +#endif // __LEAKYRELU_CPU_H__ diff --git a/src/infiniop/ops/leakyrelu/cuda/kernel.cuh b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh new file mode 100644 index 000000000..abad71b6a --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh @@ -0,0 +1,69 @@ +#ifndef __LEAKYRELU_CUDA_KERNEL_CUH__ +#define __LEAKYRELU_CUDA_KERNEL_CUH__ + +#include +#include +#include +#include + +template +__device__ __forceinline__ float to_float_for_leaky(const DevT &v) { + if constexpr (std::is_same_v) { + return __half2float(v); + } else if constexpr (std::is_same_v) { + return __bfloat162float(v); + } else { + return static_cast(v); + } +} + +template +__device__ __forceinline__ DevT from_float_for_leaky(float f) { + if constexpr 
(std::is_same_v) { + return __float2half_rn(f); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(f); + } else { + return static_cast(f); + } +} + +template +__global__ void leakyrelu_kernel( + DevT *__restrict__ out, + const DevT *__restrict__ in, + size_t n, + float negative_slope, + const size_t *__restrict__ shape, + const size_t *__restrict__ div, + const long long *__restrict__ in_stride, + const long long *__restrict__ out_stride, + int ndim) { + + size_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t grid_stride = static_cast(blockDim.x) * gridDim.x; + + for (size_t linear = gid; linear < n; linear += grid_stride) { + unsigned long long rem = linear; + long long in_off = 0; + long long out_off = 0; + for (int d = 0; d < ndim; ++d) { + unsigned long long idx_d = 0; + size_t divisor = div[d]; + if (divisor != 0) { + idx_d = rem / divisor; + rem = rem % divisor; + } else { + idx_d = 0; + } + if (in_stride[d] != 0) in_off += static_cast(idx_d) * in_stride[d]; + if (out_stride[d] != 0) out_off += static_cast(idx_d) * out_stride[d]; + } + + float v = to_float_for_leaky(in[static_cast(in_off)]); + float outv = v >= 0.0f ? v : v * negative_slope; + out[static_cast(out_off)] = from_float_for_leaky(outv); + } +} + +#endif // __LEAKYRELU_CUDA_KERNEL_CUH__ diff --git a/src/infiniop/ops/leakyrelu/info.h b/src/infiniop/ops/leakyrelu/info.h new file mode 100644 index 000000000..dd0a2d3ad --- /dev/null +++ b/src/infiniop/ops/leakyrelu/info.h @@ -0,0 +1,52 @@ +#ifndef __LEAKYRELU_INFO_H__ +#define __LEAKYRELU_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::leakyrelu { + +class LeakyReLUInfo { + LeakyReLUInfo() = default; + +public: + infiniDtype_t dt_in; + std::vector shape; + std::vector in_stride; + std::vector out_stride; + size_t n; + float negative_slope; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + + auto dt_raw = in_desc->dtype(); + infiniDtype_t dt_in = dt_raw; + + CHECK_DTYPE(dt_in, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(out_desc->ndim() == in_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + for (size_t i = 0; i < out_desc->ndim(); ++i) { + CHECK_OR_RETURN(out_desc->dim(i) == in_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t n = 1; + for (size_t i = 0; i < in_desc->ndim(); ++i) n *= static_cast(in_desc->dim(i)); + + return utils::Result(LeakyReLUInfo{ + dt_in, + out_desc->shape(), + in_desc->strides(), + out_desc->strides(), + n, + negative_slope + }); + } +}; + +} // namespace op::leakyrelu + +#endif // __LEAKYRELU_INFO_H__ diff --git a/src/infiniop/ops/leakyrelu/leakyrelu.h b/src/infiniop/ops/leakyrelu/leakyrelu.h new file mode 100644 index 000000000..a6a01a85b --- /dev/null +++ b/src/infiniop/ops/leakyrelu/leakyrelu.h @@ -0,0 +1,49 @@ +#ifndef __LEAKYRELU_H__ +#define __LEAKYRELU_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::leakyrelu::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + LeakyReLUInfo _info; \ + size_t _min_workspace_size; \ + \ + Descriptor( \ + LeakyReLUInfo info, \ + size_t min_workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _min_workspace_size(min_workspace_size) {} \ + \ 
+ public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc, \ + float negative_slope); \ + \ + size_t workspaceSize() const; \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __LEAKYRELU_H__ diff --git a/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h new file mode 100644 index 000000000..15cdccc61 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h @@ -0,0 +1,8 @@ +#ifndef __LEAKYRELU_METAX_API_H__ +#define __LEAKYRELU_METAX_API_H__ + +#include "../leakyrelu.h" + +DESCRIPTOR(metax) + +#endif // __LEAKYRELU_METAX_API_H__ diff --git a/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca new file mode 100644 index 000000000..871c3f663 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca @@ -0,0 +1,174 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../leakyrelu.h" +#include "leakyrelu_metax.h" +#include "../info.h" + +namespace op::leakyrelu::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapHcType { using Type = T; }; +template <> struct MapHcType { using Type = half; }; +#if defined(__HC_BF16_TYPES_EXIST__) || defined(__HC_ARCH__) +template <> struct MapHcType { using Type = __nv_bfloat16; }; +#endif + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t metax_leakyrelu_impl_incremental( + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + hcError_t propErr; + int device_id_local = 0; + using DevT = typename MapHcType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + hcError_t err = hcSuccess; + + err = hcMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != 
hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + + device_id_local = 0; + propErr = hcGetDevice(&device_id_local); + if (propErr == hcSuccess) { + hcDeviceProp_t prop; + if (hcGetDeviceProperties(&prop, device_id_local) == hcSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + leakyrelu_kernel<<>>( + out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = hcGetLastError(); + if (err != hcSuccess) goto cleanup; + + err = hcStreamSynchronize(stream); + if (err != hcSuccess) goto cleanup; + + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::leakyrelu::metax diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu new file mode 100644 index 000000000..05d149d5e --- /dev/null +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu @@ -0,0 +1,178 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../leakyrelu.h" +#include "leakyrelu_nvidia.cuh" +#include "../info.h" +#include +#include +#include +#include + +namespace op::leakyrelu::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapCudaType { using Type = T; }; +template <> struct MapCudaType { using Type = half; }; +#if defined(__CUDA_BF16_TYPES_EXIST__) || defined(__CUDA_ARCH__) +template <> struct MapCudaType { using Type = __nv_bfloat16; }; +#endif + 
+infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t cuda_leakyrelu_impl_incremental( + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + cudaError_t propErr; + int device_id_local = 0; + using DevT = typename MapCudaType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + cudaError_t err = cudaSuccess; + + err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + + device_id_local = 0; + propErr = cudaGetDevice(&device_id_local); + if (propErr == cudaSuccess) { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + leakyrelu_kernel<<>>( + out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = cudaGetLastError(); + if (err != cudaSuccess) goto cleanup; + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) goto cleanup; + + cudaFree(d_shape); + cudaFree(d_div); + 
cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::leakyrelu::nvidia diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh new file mode 100644 index 000000000..fb891a6c9 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LEAKYRELU_CUDA_API_H__ +#define __LEAKYRELU_CUDA_API_H__ + +#include "../leakyrelu.h" + +DESCRIPTOR(nvidia) + +#endif // __LEAKYRELU_CUDA_API_H__ diff --git a/src/infiniop/ops/leakyrelu/operator.cc b/src/infiniop/ops/leakyrelu/operator.cc new file mode 100644 index 000000000..ad6d504a8 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/operator.cc @@ -0,0 +1,164 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/leakyrelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/leakyrelu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/leakyrelu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/leakyrelu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLeakyreluDescriptor( + infiniopHandle_t handle, + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + float negative_slope) { + +#define CREATE_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return op::leakyrelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + negative_slope) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + CREATE_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CREATE_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef CREATE_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, 
kunlun); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, void *workspace, size_t workspace_size, + void *y, const void *x, void *stream) { + +#define CALC_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALC_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALC_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALC_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + CALC_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + CALC_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CALC_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALC_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef CALC_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc) { + +#define DESTROY_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + DESTROY_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + DESTROY_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + DESTROY_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DESTROY_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef DESTROY_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/test/infiniop/leakyrelu.py b/test/infiniop/leakyrelu.py new file mode 100644 index 000000000..93a8170d2 --- /dev/null +++ b/test/infiniop/leakyrelu.py @@ -0,0 +1,168 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, 
InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def leakyrelu(output, input, negative_slope): + output.copy_(torch.where(input >= 0, input, input * negative_slope)) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + negative_slope = 0.1 + print( + f"Testing Leakyrelu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} negative_slope:{negative_slope} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + leakyrelu(output.torch_tensor(), input.torch_tensor(), negative_slope) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLeakyreluDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + negative_slope + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLeakyreluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_leakyrelu(): + check_error( + LIBINFINIOP.infiniopLeakyrelu( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None + ) + ) + + lib_leakyrelu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: leakyrelu(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_leakyrelu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLeakyreluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From a8683cd498192f08ba2f308f7243ea2f3a3a97e2 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:11:57 +0800 Subject: [PATCH 06/16] [T1-1-1]: Tanh operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/tanh.h | 24 +++ src/infiniop-test/src/ops/tanh.cpp | 114 +++++++++++++ src/infiniop/ops/tanh/cpu/tanh_cpu.cc | 52 ++++++ src/infiniop/ops/tanh/cpu/tanh_cpu.h | 21 +++ src/infiniop/ops/tanh/cuda/kernel.cuh | 46 +++++ src/infiniop/ops/tanh/metax/tanh_metax.h | 8 + src/infiniop/ops/tanh/metax/tanh_metax.maca | 60 +++++++ 
src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu | 59 +++++++ src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh | 8 + src/infiniop/ops/tanh/operator.cc | 142 ++++++++++++++++ test/infiniop/tanh.py | 166 +++++++++++++++++++ 11 files changed, 700 insertions(+) create mode 100644 include/infiniop/ops/tanh.h create mode 100644 src/infiniop-test/src/ops/tanh.cpp create mode 100644 src/infiniop/ops/tanh/cpu/tanh_cpu.cc create mode 100644 src/infiniop/ops/tanh/cpu/tanh_cpu.h create mode 100644 src/infiniop/ops/tanh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tanh/metax/tanh_metax.h create mode 100644 src/infiniop/ops/tanh/metax/tanh_metax.maca create mode 100644 src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu create mode 100644 src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh create mode 100644 src/infiniop/ops/tanh/operator.cc create mode 100644 test/infiniop/tanh.py diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..62974e951 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define __INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp new file mode 100644 index 000000000..bb8c6b081 --- /dev/null +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tanh { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTanhDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t 
workspace_size; + CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTanh( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::tanh diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..23a92ed65 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,52 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..5dc73b383 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,21 @@ +#ifndef __TANH_CPU_H__ +#define __TANH_CPU_H__ + +#include +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace 
op::tanh::cpu {
+typedef struct TanhOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &input) const {
+        return std::tanh(input);
+    }
+} TanhOp;
+} // namespace op::tanh::cpu
+
+#endif // __TANH_CPU_H__
diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh
new file mode 100644
index 000000000..49605aa93
--- /dev/null
+++ b/src/infiniop/ops/tanh/cuda/kernel.cuh
@@ -0,0 +1,46 @@
+#ifndef __TANH_CUDA_H__
+#define __TANH_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cmath>
+
+namespace op::tanh::cuda {
+typedef struct TanhOp {
+    static constexpr size_t num_inputs = 1;
+
+    __device__ __forceinline__ float tanh_f32_func(float x) const {
+        return tanhf(x);
+    }
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(input);
+            float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y));
+            return __float22half2_rn(vr);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(input);
+            float yf = tanh_f32_func(xf);
+            return __float2half_rn(yf);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(input));
+            float f1 = __bfloat162float(__high2bfloat16(input));
+            float r0 = tanh_f32_func(f0);
+            float r1 = tanh_f32_func(f1);
+            return __floats2bfloat162_rn(r0, r1);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(input);
+            float rf = tanh_f32_func(xf);
+            return __float2bfloat16_rn(rf);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return tanh_f32_func(input);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return std::tanh(input);
+        } else {
+            return std::tanh(input);
+        }
+    }
+} TanhOp;
+} // namespace op::tanh::cuda
+
+#endif // __TANH_CUDA_H__
diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h b/src/infiniop/ops/tanh/metax/tanh_metax.h
new file mode 100644
index 000000000..8432a7f0d
--- /dev/null
+++ b/src/infiniop/ops/tanh/metax/tanh_metax.h
@@ -0,0 +1,8 @@
+#ifndef __TANH_METAX_API_H__
+#define __TANH_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(tanh, metax)
+
+#endif // __TANH_METAX_API_H__
diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca
new file mode 100644
index 000000000..0a01554c4
--- /dev/null
+++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca
@@ -0,0 +1,60 @@
+#include "tanh_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::tanh::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create Metax elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+
switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..eeb6c85bf --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..cb37b2528 --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANH_CUDA_API_H__ +#define __TANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) + +#endif // __TANH_CUDA_API_H__ diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc new file mode 100644 index 000000000..a5ed56f74 --- /dev/null +++ b/src/infiniop/ops/tanh/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include 
"nvidia/tanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanh_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTanhDescriptor( + infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanh( + infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..dc6ec46e8 --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# 
Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(output, input): + output.copy_(torch.tanh(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + tanh(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_tanh(): + check_error( + LIBINFINIOP.infiniopTanh( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 
8bb2121f57498e5f2d3e0f86c46c5263b13ff175 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:12:45 +0800 Subject: [PATCH 07/16] [T1-1-1]: Sigmoid_backward operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/sigmoid_backward.h | 26 +++ .../src/ops/sigmoid_backward.cpp | 122 ++++++++++++ .../cpu/sigmoid_backward_cpu.cc | 54 +++++ .../cpu/sigmoid_backward_cpu.h | 31 +++ .../ops/sigmoid_backward/cuda/kernel.cuh | 62 ++++++ .../metax/sigmoid_backward_metax.h | 8 + .../metax/sigmoid_backward_metax.maca | 62 ++++++ .../nvidia/sigmoid_backward_nvidia.cu | 61 ++++++ .../nvidia/sigmoid_backward_nvidia.cuh | 8 + src/infiniop/ops/sigmoid_backward/operator.cc | 145 ++++++++++++++ test/infiniop/sigmoid_backward.py | 184 ++++++++++++++++++ 11 files changed, 763 insertions(+) create mode 100644 include/infiniop/ops/sigmoid_backward.h create mode 100644 src/infiniop-test/src/ops/sigmoid_backward.cpp create mode 100644 src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc create mode 100644 src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h create mode 100644 src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h create mode 100644 src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca create mode 100644 src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu create mode 100644 src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh create mode 100644 src/infiniop/ops/sigmoid_backward/operator.cc create mode 100644 test/infiniop/sigmoid_backward.py diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h new file mode 100644 index 000000000..2bcc5dee6 --- /dev/null +++ b/include/infiniop/ops/sigmoid_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_SIGMOID_BACKWARD_API_H__ +#define __INFINIOP_SIGMOID_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void* grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/sigmoid_backward.cpp b/src/infiniop-test/src/ops/sigmoid_backward.cpp new file mode 100644 index 000000000..116055300 --- /dev/null +++ b/src/infiniop-test/src/ops/sigmoid_backward.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sigmoid_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if 
(tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSigmoidBackwardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateSigmoidBackwardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSigmoidBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSigmoidBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSigmoidBackward( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::sigmoid_backward diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc new file mode 100644 index 000000000..ea3d5e63c --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "sigmoid_backward_cpu.h" + +namespace op::sigmoid_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + 
infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::cpu diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h new file mode 100644 index 000000000..b2f87c2ea --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h @@ -0,0 +1,31 @@ +#ifndef __SIGMOID_BACKWARD_CPU_H__ +#define __SIGMOID_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, cpu) + +namespace op::sigmoid_backward::cpu { +typedef struct SigmoidBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &x, const T &grad_out) const { + using ComputeT = + std::conditional_t || std::is_same_v, + float, T>; + ComputeT xv = utils::cast(x); + ComputeT gov = utils::cast(grad_out); + + // sigmoid(x) = 1 / (1 + exp(-x)) + ComputeT s = static_cast(1) / (static_cast(1) + std::exp(-xv)); + + // grad_input = grad_output * s * (1 - s) + ComputeT gin = gov * s * (static_cast(1) - s); + + return utils::cast(gin); + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cpu + +#endif // __SIGMOID_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh new file mode 100644 index 000000000..6c10dd26e --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh @@ -0,0 +1,62 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_H__ +#define __SIGMOID_BACKWARD_CUDA_H__ + +#include +#include +#include +#include + +namespace op::sigmoid_backward::cuda { +typedef struct SigmoidBackwardOp { +public: + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ T operator()(const T &x, const T &grad_out) const { + if constexpr (std::is_same_v) { + float2 xf = __half22float2(x); + float2 gf = __half22float2(grad_out); + float2 sf; + sf.x = 1.0f / (1.0f + __expf(-xf.x)); + sf.y = 1.0f / (1.0f + __expf(-xf.y)); + float2 gr; + gr.x = gf.x * sf.x * (1.0f - sf.x); + gr.y = gf.y * sf.y * (1.0f - sf.y); + return __float22half2_rn(gr); + } else if constexpr (std::is_same_v) { + float xf 
= __half2float(x); + float gf = __half2float(grad_out); + float s = 1.0f / (1.0f + __expf(-xf)); + float gr = gf * s * (1.0f - s); + return __float2half_rn(gr); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + float g0 = __bfloat162float(__low2bfloat16(grad_out)); + float g1 = __bfloat162float(__high2bfloat16(grad_out)); + float s0 = 1.0f / (1.0f + __expf(-f0)); + float s1 = 1.0f / (1.0f + __expf(-f1)); + float r0 = g0 * s0 * (1.0f - s0); + float r1 = g1 * s1 * (1.0f - s1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float gf = __bfloat162float(grad_out); + float s = 1.0f / (1.0f + __expf(-xf)); + float gr = gf * s * (1.0f - s); + return __float2bfloat16_rn(gr); + } else if constexpr (std::is_same_v) { + float s = 1.0f / (1.0f + __expf(-x)); + return grad_out * s * (1.0f - s); + } else if constexpr (std::is_same_v) { + double s = 1.0 / (1.0 + std::exp(-x)); + return grad_out * s * (1.0 - s); + } else { + auto s = static_cast(1) / (static_cast(1) + std::exp(-static_cast(x))); + return static_cast(static_cast(grad_out) * s * (1.0f - s)); + } + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cuda + +#endif // __SIGMOID_BACKWARD_CUDA_H__ diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h new file mode 100644 index 000000000..fa1708559 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_METAX_API_H__ +#define __SIGMOID_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, metax) + +#endif // __SIGMOID_BACKWARD_METAX_API_H__ diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca new file mode 100644 index 000000000..ed99ac65d --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca @@ -0,0 +1,62 @@ +#include "sigmoid_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sigmoid_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, 
cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::metax diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu new file mode 100644 index 000000000..e7e604af4 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sigmoid_backward_nvidia.cuh" + +namespace op::sigmoid_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::nvidia diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh new file mode 100644 index 000000000..822f870fe --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_API_H__ +#define __SIGMOID_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, nvidia) + +#endif // __SIGMOID_BACKWARD_CUDA_API_H__ diff --git 
a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc new file mode 100644 index 000000000..f30a646d0 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sigmoid_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sigmoid_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sigmoid_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( + infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSigmoidBackward( + infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); 
+#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/sigmoid_backward.py b/test/infiniop/sigmoid_backward.py new file mode 100644 index 000000000..813791aa8 --- /dev/null +++ b/test/infiniop/sigmoid_backward.py @@ -0,0 +1,184 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, grad_output_stride, grad_input_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + INPLACE_GRAD_OUTPUT = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, + Inplace.INPLACE_GRAD_OUTPUT, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sigmoid_backward(grad_input, input_tensor, grad_output): + sigmoid_input = torch.sigmoid(input_tensor) + grad_input.copy_(grad_output * sigmoid_input * (1 - sigmoid_input)) + + +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input_tensor = TestTensor(shape, input_stride, dtype, device) + grad_output = TestTensor(shape, grad_output_stride, dtype, device) + + if inplace == Inplace.INPLACE_INPUT: + if input_stride != grad_input_stride: + return + grad_input = input_tensor + elif inplace == Inplace.INPLACE_GRAD_OUTPUT: + if grad_input_stride != grad_output_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + return + + print( + f"Testing SigmoidBackward on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sigmoid_backward(grad_input.torch_tensor(), input_tensor.torch_tensor(), grad_output.torch_tensor()) + + 
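# Reference above uses d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)); synchronize the device before exercising the library path.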
if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, + input_tensor.descriptor, + grad_output.descriptor, + ) + ) + + for tensor in [input_tensor, grad_output, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_sigmoid_backward(): + check_error( + LIBINFINIOP.infiniopSigmoidBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input_tensor.data(), + grad_output.data(), + None, + ) + ) + + lib_sigmoid_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sigmoid_backward(grad_input.torch_tensor(), input_tensor.torch_tensor(), grad_output.torch_tensor()), + device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sigmoid_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySigmoidBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From f38ea0d3f074520d01a7e2aadbcc8a63ed3984e4 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:13:25 +0800 Subject: [PATCH 08/16] [T1-1-1]: Hardswish operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/hardswish.h | 24 +++ src/infiniop-test/src/ops/hardswish.cpp | 114 ++++++++++++ .../ops/hardswish/cpu/hardswish_cpu.cc | 52 ++++++ .../ops/hardswish/cpu/hardswish_cpu.h | 30 ++++ src/infiniop/ops/hardswish/cuda/kernel.cuh | 56 ++++++ .../ops/hardswish/metax/hardswish_metax.h | 8 + .../ops/hardswish/metax/hardswish_metax.maca | 60 +++++++ .../ops/hardswish/nvidia/hardswish_nvidia.cu | 59 +++++++ .../ops/hardswish/nvidia/hardswish_nvidia.cuh | 8 + src/infiniop/ops/hardswish/operator.cc | 142 +++++++++++++++ test/infiniop/hardswish.py | 167 ++++++++++++++++++ 11 files changed, 720 insertions(+) create mode 100644 include/infiniop/ops/hardswish.h create mode 100644 src/infiniop-test/src/ops/hardswish.cpp create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.h create mode 100644 src/infiniop/ops/hardswish/cuda/kernel.cuh create mode 100644 src/infiniop/ops/hardswish/metax/hardswish_metax.h create mode 100644 src/infiniop/ops/hardswish/metax/hardswish_metax.maca create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh create mode 100644 src/infiniop/ops/hardswish/operator.cc create mode 100644 test/infiniop/hardswish.py diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..79a7c93ea --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ 
+#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp new file mode 100644 index 000000000..25b161ccf --- /dev/null +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::hardswish { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopHardswishDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopHardswish( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + 
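+// Hardswish takes no scalar attributes; a test case is described entirely by its input/output/ans tensors.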
+std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::hardswish diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..e7b68508a --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..a42009017 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,30 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardswishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x = static_cast(input); + double y = x + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x * (y / 6.0); + return static_cast(out); + } + } +} HardswishOp; +} // namespace op::hardswish::cpu + +#endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..be22e5faa --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,56 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#include +#include + +namespace 
op::hardswish::cuda { + +typedef struct HardswishOp { + static constexpr size_t num_inputs = 1; + + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 6.0f : y); + return x * (y * (1.0f / 6.0f)); + } + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y) + ); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } + } +} HardswishOp; + +} // namespace op::hardswish::cuda + +#endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 index 000000000..16b131aa9 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_METAX_API_H__ +#define __HARDSWISH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // __HARDSWISH_METAX_API_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..e53b94357 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,60 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::metax diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..0aff55cd2 --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..f869ad52f --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // __HARDSWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..7787c799b --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include 
"nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__C infiniStatus_t infiniopCreateHardswishDescriptor( + infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHardswish( + infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..424b30567 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,167 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# 
======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hardswish(output, input): + output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + hardswish(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardswishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardswishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardswish( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = 
args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 7c848685877e27582c2bef535ac1ebada7cbf613 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:14:16 +0800 Subject: [PATCH 09/16] [T1-1-1]: Cast operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/cast.h | 24 ++ src/infiniop-test/src/ops/cast.cpp | 122 ++++++++++ src/infiniop/ops/cast/cast.h | 48 ++++ src/infiniop/ops/cast/cpu/cast_cpu.cc | 135 ++++++++++ src/infiniop/ops/cast/cpu/cast_cpu.h | 8 + src/infiniop/ops/cast/cuda/kernel.cuh | 75 ++++++ src/infiniop/ops/cast/info.h | 58 +++++ src/infiniop/ops/cast/metax/cast_metax.h | 8 + src/infiniop/ops/cast/metax/cast_metax.maca | 201 +++++++++++++++ src/infiniop/ops/cast/nvidia/cast_nvidia.cu | 205 ++++++++++++++++ src/infiniop/ops/cast/nvidia/cast_nvidia.cuh | 8 + src/infiniop/ops/cast/operator.cc | 142 +++++++++++ test/infiniop/cast.py | 244 +++++++++++++++++++ 13 files changed, 1278 insertions(+) create mode 100644 include/infiniop/ops/cast.h create mode 100644 src/infiniop-test/src/ops/cast.cpp create mode 100644 src/infiniop/ops/cast/cast.h create mode 100644 src/infiniop/ops/cast/cpu/cast_cpu.cc create mode 100644 src/infiniop/ops/cast/cpu/cast_cpu.h create mode 100644 src/infiniop/ops/cast/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cast/info.h create mode 100644 src/infiniop/ops/cast/metax/cast_metax.h create mode 100644 src/infiniop/ops/cast/metax/cast_metax.maca create mode 100644 src/infiniop/ops/cast/nvidia/cast_nvidia.cu create mode 100644 src/infiniop/ops/cast/nvidia/cast_nvidia.cuh create mode 100644 src/infiniop/ops/cast/operator.cc create mode 100644 test/infiniop/cast.py diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h new file mode 100644 index 000000000..82b41490e --- /dev/null +++ b/include/infiniop/ops/cast.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CAST_API_H__ +#define __INFINIOP_CAST_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCastDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/cast.cpp b/src/infiniop-test/src/ops/cast.cpp new file mode 100644 index 000000000..d91f5eb6c --- /dev/null +++ b/src/infiniop-test/src/ops/cast.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cast { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + 
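For orientation, the PyTorch reference used by the HardSwish test above is the closed form x * clamp(x + 3, 0, 6) / 6. A minimal self-contained sketch (illustrative only; the helper name is not part of the patch) confirming that this formula agrees with torch.nn.functional.hardswish:

    import torch
    import torch.nn.functional as F

    def hardswish_ref(x: torch.Tensor) -> torch.Tensor:
        # Same closed form as the test's reference: x * clamp(x + 3, 0, 6) / 6
        return x * torch.clamp(x + 3, min=0, max=6) / 6

    x = torch.linspace(-6.0, 6.0, steps=121)
    assert torch.allclose(hardswish_ref(x), F.hardswish(x), atol=1e-6)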
test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCastDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCast(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCast( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::cast diff --git a/src/infiniop/ops/cast/cast.h b/src/infiniop/ops/cast/cast.h new file mode 100644 index 000000000..5e66997cc --- /dev/null +++ b/src/infiniop/ops/cast/cast.h @@ -0,0 +1,48 @@ +#ifndef __CAST_H__ +#define __CAST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cast::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + CastInfo _info; \ + size_t _min_workspace_size; \ + \ + Descriptor( \ + CastInfo info, \ + size_t min_workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _min_workspace_size(min_workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + 
infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc); \ + \ + size_t workspaceSize() const; \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __CAST_H__ diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc new file mode 100644 index 000000000..9a8bcc5a5 --- /dev/null +++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc @@ -0,0 +1,135 @@ +#include "cast_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../info.h" +#include "infinicore.h" +#include + +namespace op::cast::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + + *desc_ptr = new Descriptor( + info_r.take(), + 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline void cpu_cast_impl_incremental( + void *output, const void *input, const op::cast::CastInfo &info) { + + const size_t ndim = info.shape.size(); + const size_t n = info.n; + + auto out_base = reinterpret_cast(output); + auto in_base = reinterpret_cast(input); + + const std::vector &shape = info.shape; + const std::vector &in_stride = info.in_stride; + const std::vector &out_stride = info.out_stride; + + if (n == 0) return; + + std::vector idx(ndim, 0); + ptrdiff_t in_off = 0; + ptrdiff_t out_off = 0; + + for (size_t it = 0; it < n; ++it) { + const Tin *in_elem = in_base + in_off; + Tout *out_elem = out_base + out_off; + *out_elem = utils::cast(*in_elem); + + for (int d = static_cast(ndim) - 1; d >= 0; --d) { + idx[d] += 1; + if (in_stride[d] != 0) in_off += in_stride[d]; + if (out_stride[d] != 0) out_off += out_stride[d]; + + if (idx[d] < shape[d]) { + break; + } else { + idx[d] = 0; + if (in_stride[d] != 0) in_off -= static_cast(shape[d]) * in_stride[d]; + if (out_stride[d] != 0) out_off -= static_cast(shape[d]) * out_stride[d]; + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; // or INFINI_STATUS_INPLACE_NOT_SUPPORTED + } + + #define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_I64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_U32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_U64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F16: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + 
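The CPU cast path above avoids recomputing a full offset for every element: it advances a multi-dimensional index like an odometer and updates the input and output offsets incrementally, so a step touches only the dimensions that carry. A small Python sketch of the same traversal (the names and flat-list storage are illustrative, not the library's API); a zero stride, as used for broadcast dimensions, simply contributes nothing to the offset:

    def strided_copy(shape, in_stride, out_stride, src, dst):
        # Odometer-style walk: bump the innermost index, carry into outer dims,
        # and keep running input/output offsets instead of recomputing them.
        ndim = len(shape)
        n = 1
        for extent in shape:
            n *= extent
        idx = [0] * ndim
        in_off = out_off = 0
        for _ in range(n):
            dst[out_off] = src[in_off]            # the dtype conversion would happen here
            for d in range(ndim - 1, -1, -1):
                idx[d] += 1
                in_off += in_stride[d]
                out_off += out_stride[d]
                if idx[d] < shape[d]:
                    break
                idx[d] = 0                         # carry: rewind this dimension
                in_off -= shape[d] * in_stride[d]
                out_off -= shape[d] * out_stride[d]

    # A 2x3 contiguous source copied into a destination whose rows are padded to stride 4.
    src = list(range(6))
    dst = [0] * 8
    strided_copy((2, 3), (3, 1), (4, 1), src, dst)
    assert dst == [0, 1, 2, 0, 3, 4, 5, 0]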
CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CASE_OUT + + return INFINI_STATUS_SUCCESS; +} + + +} // namespace op::cast::cpu diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.h b/src/infiniop/ops/cast/cpu/cast_cpu.h new file mode 100644 index 000000000..ca929a694 --- /dev/null +++ b/src/infiniop/ops/cast/cpu/cast_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CAST_CPU_H__ +#define __CAST_CPU_H__ + +#include "../cast.h" + +DESCRIPTOR(cpu) + +#endif // __CAST_CPU_H__ diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh new file mode 100644 index 000000000..eee801b12 --- /dev/null +++ b/src/infiniop/ops/cast/cuda/kernel.cuh @@ -0,0 +1,75 @@ +#ifndef __CAST_CUDA_KERNEL_CUH__ +#define __CAST_CUDA_KERNEL_CUH__ + +#include +#include +#include + +template +__device__ __forceinline__ Tout device_cast(const Tin &v) { + if constexpr (std::is_same_v) { + float f; + if constexpr (std::is_same_v) { + f = __half2float(v); + } else { + f = static_cast(v); + } + return __float2half_rn(f); + } else if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return __half2float(v); + } else { + return static_cast(v); + } + } else if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return static_cast(__half2float(v)); + } else { + return static_cast(v); + } + } else { // integer outputs + // convert via double/float then to integer (truncate) + if constexpr (std::is_same_v) { + float f = __half2float(v); + return static_cast(f); + } else { + return static_cast(v); + } + } +} + +template +__global__ void cast_kernel( + ToutDev *__restrict__ out, + const TinDev *__restrict__ in, + size_t n, + const size_t *__restrict__ shape, + const size_t *__restrict__ div, + const long long *__restrict__ in_stride, + const long long *__restrict__ out_stride, + int ndim) { + + size_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t grid_stride = static_cast(blockDim.x) * gridDim.x; + + for (size_t linear = gid; linear < n; linear += grid_stride) { + unsigned long long rem = linear; + long long in_off = 0; + long long out_off = 0; + for (int d = 0; d < ndim; ++d) { + unsigned long long idx_d = 0; + size_t divisor = div[d]; + if (divisor != 0) { + idx_d = rem / divisor; + rem = rem % divisor; + } else { + idx_d = 0; + } + if (in_stride[d] != 0) in_off += static_cast(idx_d) * in_stride[d]; + if (out_stride[d] != 0) out_off += static_cast(idx_d) * out_stride[d]; + } + out[static_cast(out_off)] = device_cast(in[static_cast(in_off)]); + } +} + +#endif // __CAST_CUDA_KERNEL_CUH__ diff --git a/src/infiniop/ops/cast/info.h b/src/infiniop/ops/cast/info.h new file mode 100644 index 000000000..4283a8224 --- /dev/null +++ b/src/infiniop/ops/cast/info.h @@ -0,0 +1,58 @@ +#ifndef __CAST_INFO_H__ +#define __CAST_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::cast { + +class CastInfo { + CastInfo() = default; + +public: + infiniDtype_t dt_in; + infiniDtype_t dt_out; + std::vector shape; + std::vector in_stride; + std::vector out_stride; + size_t n; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + auto dt_out = out_desc->dtype(); + auto dt_in = in_desc->dtype(); + + CHECK_DTYPE(dt_in, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_F16, 
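The CUDA kernel shared by the NVIDIA, Iluvatar, and MetaX backends takes a different route from the CPU walk: each thread converts its flat element index back into per-dimension coordinates using precomputed divisors (div[d] is the product of the trailing dimensions, uploaded from the host as h_div), then applies the input and output strides. A host-side Python sketch of that index arithmetic, assuming row-major ordering of the shape:

    def offsets_from_linear(linear, shape, in_stride, out_stride):
        # div[d] = product of the dimensions after d, mirroring the h_div array the host uploads.
        ndim = len(shape)
        div = [1] * ndim
        for d in range(ndim - 2, -1, -1):
            div[d] = div[d + 1] * shape[d + 1]
        rem, in_off, out_off = linear, 0, 0
        for d in range(ndim):
            idx_d, rem = divmod(rem, div[d])
            in_off += idx_d * in_stride[d]
            out_off += idx_d * out_stride[d]
        return in_off, out_off

    # Element 5 of a (2, 3) tensor is (row 1, col 2): contiguous input, output rows padded to stride 4.
    assert offsets_from_linear(5, (2, 3), (3, 1), (4, 1)) == (5, 6)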
INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_DTYPE(dt_out, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(out_desc->ndim() == in_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + for (size_t i = 0; i < out_desc->ndim(); ++i) { + CHECK_OR_RETURN(out_desc->dim(i) == in_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t n = 1; + for (size_t i = 0; i < in_desc->ndim(); ++i) n *= static_cast(in_desc->dim(i)); + + return utils::Result(CastInfo{ + dt_in, + dt_out, + out_desc->shape(), + in_desc->strides(), + out_desc->strides(), + n, + }); + } +}; + +} // namespace op::cast + +#endif // __CAST_INFO_H__ diff --git a/src/infiniop/ops/cast/metax/cast_metax.h b/src/infiniop/ops/cast/metax/cast_metax.h new file mode 100644 index 000000000..5ba92911e --- /dev/null +++ b/src/infiniop/ops/cast/metax/cast_metax.h @@ -0,0 +1,8 @@ +#ifndef __CAST_METAX_API_H__ +#define __CAST_METAX_API_H__ + +#include "../cast.h" + +DESCRIPTOR(metax) + +#endif // __CAST_METAX_API_H__ diff --git a/src/infiniop/ops/cast/metax/cast_metax.maca b/src/infiniop/ops/cast/metax/cast_metax.maca new file mode 100644 index 000000000..4b2103da3 --- /dev/null +++ b/src/infiniop/ops/cast/metax/cast_metax.maca @@ -0,0 +1,201 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/metax/metax_common.h" +#include "../cast.h" +#include "cast_metax.h" +#include "../info.h" + +namespace op::cast::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapHcType { using Type = T; }; +template <> struct MapHcType { using Type = half; }; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t metax_cast_impl_incremental( + void *output_, const void *input_, + const op::cast::CastInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + hcError_t propErr; + int device_id_local = 0; + using DevTout = typename MapHcType::Type; + using DevTin = typename MapHcType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + hcError_t err = hcSuccess; + err = hcMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = 
hcMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + + err = hcMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + + device_id_local = 0; + propErr = hcGetDevice(&device_id_local); + if (propErr == hcSuccess) { + hcDeviceProp_t prop; + if (hcGetDeviceProperties(&prop, device_id_local) == hcSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + cast_kernel<<>>( + out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = hcGetLastError(); + if (err != hcSuccess) goto cleanup; + + err = hcStreamSynchronize(stream); + if (err != hcSuccess) goto cleanup; + + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; + } + + #define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CASE_OUT + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::cast::metax diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu new file mode 100644 index 
000000000..2ad20c203 --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -0,0 +1,205 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../cast.h" +#include "cast_nvidia.cuh" +#include "../info.h" +#include +#include +#include +#include + +namespace op::cast::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapCudaType { using Type = T; }; +template <> struct MapCudaType { using Type = half; }; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t cuda_cast_impl_incremental( + void *output_, const void *input_, + const op::cast::CastInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + cudaError_t propErr; + int device_id_local = 0; + using DevTout = typename MapCudaType::Type; + using DevTin = typename MapCudaType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + cudaError_t err = cudaSuccess; + err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) goto cleanup; + + err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + + device_id_local = 0; + propErr = cudaGetDevice(&device_id_local); + if (propErr == cudaSuccess) { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { + bs = 
std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + cast_kernel<<>>( + out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = cudaGetLastError(); + if (err != cudaSuccess) goto cleanup; + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) goto cleanup; + + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; + } + + #define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CASE_OUT + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::cast::nvidia diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh new file mode 100644 index 000000000..032e1fb2e --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CAST_CUDA_API_H__ +#define __CAST_CUDA_API_H__ + +#include "../cast.h" + +DESCRIPTOR(nvidia) + +#endif // __CAST_CUDA_API_H__ diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc new file mode 100644 index 000000000..fc3aef4ad --- /dev/null +++ b/src/infiniop/ops/cast/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cast.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cast_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cast_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cast_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCastDescriptor( + infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cast::NAMESPACE::Descriptor::create( \ + handle, \ + 
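The launch configuration in cast_nvidia.cu (and its MACA counterpart) is deliberately conservative: the block size starts at 256, is capped at half of maxThreadsPerBlock when the device properties can be queried, and the grid covers ceil(n / block) with a floor of one block; because the kernel is grid-stride, an undersized grid would still be correct, only slower. A plain Python sketch of that arithmetic with illustrative values:

    def launch_config(n, max_threads_per_block=None):
        bs = 256
        if max_threads_per_block is not None:
            # Cap at half the device limit, mirroring prop.maxThreadsPerBlock / 2.
            bs = min(bs, max_threads_per_block // 2)
        if bs <= 0:
            bs = 256
        grid = (n + bs - 1) // bs   # ceiling division
        return max(grid, 1), bs

    assert launch_config(1_000_000, max_threads_per_block=1024) == (3907, 256)
    assert launch_config(10, max_threads_per_block=256) == (1, 128)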
reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCast( + infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/cast.py b/test/infiniop/cast.py new file mode 100644 index 000000000..87b572741 --- /dev/null +++ b/test/infiniop/cast.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map +) +import itertools + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +_TEST_CASES = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 
5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +_TOLERANCE_MAP = { + ("float", "float"): {"atol": 1e-3, "rtol": 1e-3}, + ("int", "float"): {"atol": 1.0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + + +def _is_float_dtype(inf_dt): + return inf_dt in _FLOAT_DTYPES + + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def reference_cast_torch(output_tensor, input_tensor): + converted = input_tensor.to(dtype=output_tensor.dtype, device=output_tensor.device).clone() + output_tensor.copy_(converted) + + +def make_integer_torch_tensor(shape, inf_dt, device): + use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + + +def test( + handle, + device, + shape, + in_stride, + out_stride, + dtype_pair, + sync=None, +): + in_dt, out_dt = dtype_pair + + if not is_supported_dt(in_dt) or not is_supported_dt(out_dt): + print(f"Skipping test for in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} because dtype not supported on this platform") + return + + try: + if _is_integer_dtype(in_dt): + in_torch = make_integer_torch_tensor(shape, in_dt, device) + input = TestTensor.from_torch(in_torch, in_dt, device) + else: + input = TestTensor(shape, in_stride, in_dt, device, mode="random") + + output = TestTensor(shape, out_stride, out_dt, device, mode="zeros") + + if output.is_broadcast(): + return + + print(f"Testing Cast on {InfiniDeviceNames[device]} shape={shape} in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} in_stride={in_stride} out_stride={out_stride}") + + reference_cast_torch(output.actual_tensor(), input.torch_tensor()) + + expected = output.actual_tensor().clone() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCastDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error(LIBINFINIOP.infiniopGetCastWorkspaceSize(descriptor, ctypes.byref(workspace_size))) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_cast(): + check_error( + LIBINFINIOP.infiniopCast( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_cast() + + actual = output.actual_tensor() + + if _is_integer_dtype(in_dt) and _is_float_dtype(out_dt): + tol = _TOLERANCE_MAP[("int", "float")] + atol, rtol = tol["atol"], 
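The integer sampling ranges above (about ±2000 for the 32-bit types, ±2048 for the 64-bit ones, and non-negative for the unsigned variants) look deliberately small: every value drawn from them is exactly representable in float16, so integer-to-float casts can still be compared with a tight tolerance. That cutoff sits at 2048; this is an inferred rationale rather than something the patch states, and the throwaway check below only illustrates the float16 property:

    import torch

    # float16 carries 11 bits of significand, so every integer with |x| <= 2048 round-trips exactly.
    x = torch.arange(-2048, 2049, dtype=torch.int64)
    assert torch.equal(x.to(torch.float16).to(torch.int64), x)

    # 2049 is the first positive integer float16 cannot hold; it rounds to 2048.
    assert torch.tensor(2049, dtype=torch.int64).to(torch.float16).item() == 2048.0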
tol["rtol"] + elif _is_float_dtype(in_dt) and _is_float_dtype(out_dt): + tol = _TOLERANCE_MAP[("float", "float")] + atol, rtol = tol["atol"], tol["rtol"] + else: + atol, rtol = 0, 0 + + if DEBUG: + debug(actual, expected, atol=atol, rtol=rtol) + + assert torch.allclose(actual, expected, atol=atol, rtol=rtol), \ + f"Mismatch for in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} shape={shape}" + + if PROFILE: + profile_operation("PyTorch", lambda: reference_cast_torch(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cast(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyCastDescriptor(descriptor)) + + except RuntimeError as e: + if "not implemented for 'UInt32'" in str(e) or "not implemented for 'UInt64'" in str(e): + #print(f"Skipping unsupported operation: {e}") + return False + else: + raise + + +def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + integer_pairs = itertools.product(_INTEGER_DTYPES, _INTEGER_DTYPES) + float_pairs = itertools.product(_FLOAT_DTYPES, _FLOAT_DTYPES) + int_to_float_pairs = itertools.product(_INTEGER_DTYPES, _FLOAT_DTYPES) + + all_pairs = list(set(itertools.chain(integer_pairs, float_pairs, int_to_float_pairs))) + + supported_pairs = [] + skipped_pairs = [] + for pair in all_pairs: + in_dt, out_dt = pair + if is_supported_dt(in_dt) and is_supported_dt(out_dt): + supported_pairs.append(pair) + else: + skipped_pairs.append(pair) + + print(f"Supported dtype pairs: {[(InfiniDtypeNames[in_d], InfiniDtypeNames[out_d]) for in_d, out_d in supported_pairs]}") + if skipped_pairs: + print(f"Warning: skipping unsupported dtype pairs: {[(InfiniDtypeNames[in_d], InfiniDtypeNames[out_d]) for in_d, out_d in skipped_pairs]}") + + devices = get_test_devices(args) + + for device in devices: + test_operator(device, test, _TEST_CASES, supported_pairs) + + print("\033[92mAll cast tests passed!\033[0m") + + +if __name__ == "__main__": + main() From 53bfa538eae8054a151aebe117f18e17f105f1c5 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:14:42 +0800 Subject: [PATCH 10/16] [T1-1-1]: Where operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/where.h | 28 ++ src/infiniop-test/src/ops/cast.cpp | 24 +- src/infiniop-test/src/ops/cos.cpp | 14 +- src/infiniop-test/src/ops/exp.cpp | 15 +- src/infiniop-test/src/ops/hardswish.cpp | 24 +- src/infiniop-test/src/ops/leakyrelu.cpp | 28 +- .../src/ops/sigmoid_backward.cpp | 28 +- src/infiniop-test/src/ops/sin.cpp | 14 +- src/infiniop-test/src/ops/tanh.cpp | 24 +- src/infiniop-test/src/ops/where.cpp | 151 +++++++++ src/infiniop/ops/cast/cpu/cast_cpu.cc | 83 ++--- src/infiniop/ops/cast/cuda/kernel.cuh | 8 +- src/infiniop/ops/cast/info.h | 20 +- src/infiniop/ops/cast/nvidia/cast_nvidia.cu | 144 +++++---- src/infiniop/ops/cast/operator.cc | 16 +- src/infiniop/ops/cos/cpu/cos_cpu.h | 2 +- src/infiniop/ops/cos/cuda/kernel.cuh | 69 +++-- src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 2 +- src/infiniop/ops/cos/operator.cc | 20 +- src/infiniop/ops/exp/cpu/exp_cpu.h | 2 +- src/infiniop/ops/exp/cuda/kernel.cuh | 54 ++-- src/infiniop/ops/exp/nvidia/exp_nvidia.cu | 2 +- src/infiniop/ops/exp/operator.cc | 20 +- .../ops/hardswish/cpu/hardswish_cpu.h | 2 +- src/infiniop/ops/hardswish/cuda/kernel.cuh | 79 +++-- 
.../ops/hardswish/nvidia/hardswish_nvidia.cu | 2 +- src/infiniop/ops/hardswish/operator.cc | 28 +- .../ops/leakyrelu/cpu/leakyrelu_cpu.cc | 40 ++- src/infiniop/ops/leakyrelu/cuda/kernel.cuh | 10 +- src/infiniop/ops/leakyrelu/info.h | 7 +- .../ops/leakyrelu/nvidia/leakyrelu_nvidia.cu | 105 ++++--- src/infiniop/ops/leakyrelu/operator.cc | 28 +- .../cpu/sigmoid_backward_cpu.h | 5 +- .../ops/sigmoid_backward/cuda/kernel.cuh | 6 +- src/infiniop/ops/sigmoid_backward/operator.cc | 22 +- src/infiniop/ops/sin/cpu/sin_cpu.h | 2 +- src/infiniop/ops/sin/cuda/kernel.cuh | 52 ++-- src/infiniop/ops/sin/nvidia/sin_nvidia.cu | 2 +- src/infiniop/ops/sin/operator.cc | 20 +- src/infiniop/ops/tanh/cpu/tanh_cpu.h | 2 +- src/infiniop/ops/tanh/cuda/kernel.cuh | 66 ++-- src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu | 2 +- src/infiniop/ops/tanh/operator.cc | 20 +- src/infiniop/ops/where/cpu/where_cpu.cc | 84 +++++ src/infiniop/ops/where/cpu/where_cpu.h | 19 ++ src/infiniop/ops/where/cuda/kernel.cuh | 15 + src/infiniop/ops/where/metax/where_metax.h | 8 + src/infiniop/ops/where/metax/where_metax.maca | 62 ++++ src/infiniop/ops/where/nvidia/where_nvidia.cu | 91 ++++++ .../ops/where/nvidia/where_nvidia.cuh | 8 + src/infiniop/ops/where/operator.cc | 148 +++++++++ test/infiniop/where.py | 288 ++++++++++++++++++ 52 files changed, 1507 insertions(+), 508 deletions(-) create mode 100644 include/infiniop/ops/where.h create mode 100644 src/infiniop-test/src/ops/where.cpp create mode 100644 src/infiniop/ops/where/cpu/where_cpu.cc create mode 100644 src/infiniop/ops/where/cpu/where_cpu.h create mode 100644 src/infiniop/ops/where/cuda/kernel.cuh create mode 100644 src/infiniop/ops/where/metax/where_metax.h create mode 100644 src/infiniop/ops/where/metax/where_metax.maca create mode 100644 src/infiniop/ops/where/nvidia/where_nvidia.cu create mode 100644 src/infiniop/ops/where/nvidia/where_nvidia.cuh create mode 100644 src/infiniop/ops/where/operator.cc create mode 100644 test/infiniop/where.py diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..a328c312a --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + infiniopTensorDescriptor_t condition); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); + +__C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/cast.cpp b/src/infiniop-test/src/ops/cast.cpp index d91f5eb6c..6547bc25a 100644 --- a/src/infiniop-test/src/ops/cast.cpp +++ b/src/infiniop-test/src/ops/cast.cpp @@ -58,8 +58,8 @@ std::shared_ptr Test::run( auto output = _attributes->output->to(device, device_id); CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc, - output->desc(), - input->desc()), + output->desc(), + input->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t 
workspace_size; CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size), @@ -68,9 +68,9 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopCast(op_desc, workspace, workspace_size, - output->data(), - input->data(), - nullptr), + output->data(), + input->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -94,16 +94,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -119,4 +119,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::cast +} // namespace infiniop_test::cast diff --git a/src/infiniop-test/src/ops/cos.cpp b/src/infiniop-test/src/ops/cos.cpp index 7cae4574d..52de283af 100644 --- a/src/infiniop-test/src/ops/cos.cpp +++ b/src/infiniop-test/src/ops/cos.cpp @@ -86,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -111,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::cos +} // namespace infiniop_test::cos diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp index 395408e15..070f8ef6b 100644 --- a/src/infiniop-test/src/ops/exp.cpp +++ b/src/infiniop-test/src/ops/exp.cpp @@ -41,7 +41,6 @@ std::shared_ptr Test::build( test->_atol = 1e-6; } - return test; } @@ -87,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -112,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::exp +} // namespace infiniop_test::exp diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp index 25b161ccf..0ccf4f52a 100644 --- a/src/infiniop-test/src/ops/hardswish.cpp +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -50,8 +50,8 @@ std::shared_ptr Test::run( auto input = _attributes->input->to(device, device_id); auto output = _attributes->output->to(device, device_id); CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc, - output->desc(), - input->desc()), + output->desc(), + input->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t workspace_size; 
CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size), @@ -60,9 +60,9 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size, - output->data(), - input->data(), - nullptr), + output->data(), + input->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -86,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -111,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::hardswish +} // namespace infiniop_test::hardswish diff --git a/src/infiniop-test/src/ops/leakyrelu.cpp b/src/infiniop-test/src/ops/leakyrelu.cpp index c63741120..b7d9eb89c 100644 --- a/src/infiniop-test/src/ops/leakyrelu.cpp +++ b/src/infiniop-test/src/ops/leakyrelu.cpp @@ -54,11 +54,11 @@ std::shared_ptr Test::run( auto input = _attributes->input->to(device, device_id); auto output = _attributes->output->to(device, device_id); CHECK_OR(infiniopCreateLeakyreluDescriptor(handle, &op_desc, - output->desc(), - input->desc(), - _attributes->negative_slope), + output->desc(), + input->desc(), + _attributes->negative_slope), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); - + size_t workspace_size; CHECK_OR(infiniopGetLeakyreluWorkspaceSize(op_desc, &workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); @@ -68,9 +68,9 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); } CHECK_OR(infiniopLeakyrelu(op_desc, workspace, workspace_size, - output->data(), - input->data(), - nullptr), + output->data(), + input->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -94,16 +94,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {"negative_slope"}; +std::vector Test::attribute_names() { + return {"negative_slope"}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -120,4 +120,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::leakyrelu +} // namespace infiniop_test::leakyrelu diff --git a/src/infiniop-test/src/ops/sigmoid_backward.cpp b/src/infiniop-test/src/ops/sigmoid_backward.cpp index 116055300..434dbf598 100644 --- a/src/infiniop-test/src/ops/sigmoid_backward.cpp +++ b/src/infiniop-test/src/ops/sigmoid_backward.cpp @@ -54,9 +54,9 @@ std::shared_ptr Test::run( auto grad_output = _attributes->grad_output->to(device, device_id); auto grad_input = _attributes->grad_input->to(device, device_id); CHECK_OR(infiniopCreateSigmoidBackwardDescriptor(handle, &op_desc, - grad_input->desc(), - input->desc(), - 
grad_output->desc()), + grad_input->desc(), + input->desc(), + grad_output->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t workspace_size; CHECK_OR(infiniopGetSigmoidBackwardWorkspaceSize(op_desc, &workspace_size), @@ -65,10 +65,10 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopSigmoidBackward(op_desc, workspace, workspace_size, - grad_input->data(), - input->data(), - grad_output->data(), - nullptr), + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -93,16 +93,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "grad_output", "grad_input", "ans"}; +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; } -std::vector Test::output_names() { - return {"grad_input"}; +std::vector Test::output_names() { + return {"grad_input"}; } std::string Test::toString() const { @@ -119,4 +119,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::sigmoid_backward +} // namespace infiniop_test::sigmoid_backward diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp index db256c283..e1406e588 100644 --- a/src/infiniop-test/src/ops/sin.cpp +++ b/src/infiniop-test/src/ops/sin.cpp @@ -86,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -111,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::sin +} // namespace infiniop_test::sin diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp index bb8c6b081..6aeb3c301 100644 --- a/src/infiniop-test/src/ops/tanh.cpp +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -50,8 +50,8 @@ std::shared_ptr Test::run( auto input = _attributes->input->to(device, device_id); auto output = _attributes->output->to(device, device_id); CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, - output->desc(), - input->desc()), + output->desc(), + input->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t workspace_size; CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), @@ -60,9 +60,9 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, - output->data(), - input->data(), - nullptr), + output->data(), + input->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -86,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } 
-std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -111,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::tanh +} // namespace infiniop_test::tanh diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp new file mode 100644 index 000000000..fea9cba92 --- /dev/null +++ b/src/infiniop-test/src/ops/where.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::where { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr condition; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("condition") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->condition = tensors["condition"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->a->ggml_type(); + if (elemType == GGML_TYPE_I8) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopWhereDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto condition = _attributes->condition->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateWhereDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc(), + condition->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetWhereWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopWhere(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const 
std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopWhere( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "condition", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- condition: " << _attributes->condition->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::where diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc index 9a8bcc5a5..36d2e9e28 100644 --- a/src/infiniop/ops/cast/cpu/cast_cpu.cc +++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc @@ -46,7 +46,9 @@ static inline void cpu_cast_impl_incremental( const std::vector &in_stride = info.in_stride; const std::vector &out_stride = info.out_stride; - if (n == 0) return; + if (n == 0) { + return; + } std::vector idx(ndim, 0); ptrdiff_t in_off = 0; @@ -59,15 +61,23 @@ static inline void cpu_cast_impl_incremental( for (int d = static_cast(ndim) - 1; d >= 0; --d) { idx[d] += 1; - if (in_stride[d] != 0) in_off += in_stride[d]; - if (out_stride[d] != 0) out_off += out_stride[d]; + if (in_stride[d] != 0) { + in_off += in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += out_stride[d]; + } if (idx[d] < shape[d]) { break; } else { idx[d] = 0; - if (in_stride[d] != 0) in_off -= static_cast(shape[d]) * in_stride[d]; - if (out_stride[d] != 0) out_off -= static_cast(shape[d]) * out_stride[d]; + if (in_stride[d] != 0) { + in_off -= static_cast(shape[d]) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off -= static_cast(shape[d]) * out_stride[d]; + } } } } @@ -80,39 +90,39 @@ infiniStatus_t Descriptor::calculate( const void *input, void *stream) const { - if (output == const_cast(input)) { + if (output == const_cast(input)) { return INFINI_STATUS_BAD_PARAM; // or INFINI_STATUS_INPLACE_NOT_SUPPORTED } - #define CASE_OUT(DT_OUT, TOUT) \ - case DT_OUT: { \ - switch (_info.dt_in) { \ - case INFINI_DTYPE_I32: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_I64: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_U32: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_U64: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_F16: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_F32: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_F64: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - default: \ - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ - } \ - break; \ - } +#define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_I64: \ + 
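The where test drives the operator with four tensors (condition, a, b, and the output c) and checks c against a precomputed ans. Assuming the usual element-wise select semantics, where c takes the value of a wherever condition is true and of b elsewhere, torch.where serves as a one-line reference:

    import torch

    cond = torch.tensor([[True, False], [False, True]])
    a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    b = torch.tensor([[10.0, 20.0], [30.0, 40.0]])

    # Element-wise select: keep a where cond holds, fall back to b elsewhere.
    c = torch.where(cond, a, b)
    assert torch.equal(c, torch.tensor([[1.0, 20.0], [30.0, 4.0]]))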
cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_U32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_U64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F16: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } switch (_info.dt_out) { CASE_OUT(INFINI_DTYPE_I32, int32_t); @@ -126,10 +136,9 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CASE_OUT +#undef CASE_OUT return INFINI_STATUS_SUCCESS; } - } // namespace op::cast::cpu diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh index eee801b12..3736442a3 100644 --- a/src/infiniop/ops/cast/cuda/kernel.cuh +++ b/src/infiniop/ops/cast/cuda/kernel.cuh @@ -65,8 +65,12 @@ __global__ void cast_kernel( } else { idx_d = 0; } - if (in_stride[d] != 0) in_off += static_cast(idx_d) * in_stride[d]; - if (out_stride[d] != 0) out_off += static_cast(idx_d) * out_stride[d]; + if (in_stride[d] != 0) { + in_off += static_cast(idx_d) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += static_cast(idx_d) * out_stride[d]; + } } out[static_cast(out_off)] = device_cast(in[static_cast(in_off)]); } diff --git a/src/infiniop/ops/cast/info.h b/src/infiniop/ops/cast/info.h index 4283a8224..8f85f6da8 100644 --- a/src/infiniop/ops/cast/info.h +++ b/src/infiniop/ops/cast/info.h @@ -10,7 +10,7 @@ namespace op::cast { class CastInfo { CastInfo() = default; -public: +public: infiniDtype_t dt_in; infiniDtype_t dt_out; std::vector shape; @@ -21,9 +21,9 @@ class CastInfo { static utils::Result create( infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t in_desc) { - + auto dt_out = out_desc->dtype(); - auto dt_in = in_desc->dtype(); + auto dt_in = in_desc->dtype(); CHECK_DTYPE(dt_in, INFINI_DTYPE_I32, INFINI_DTYPE_I64, @@ -40,14 +40,16 @@ class CastInfo { } size_t n = 1; - for (size_t i = 0; i < in_desc->ndim(); ++i) n *= static_cast(in_desc->dim(i)); + for (size_t i = 0; i < in_desc->ndim(); ++i) { + n *= static_cast(in_desc->dim(i)); + } return utils::Result(CastInfo{ - dt_in, - dt_out, - out_desc->shape(), - in_desc->strides(), - out_desc->strides(), + dt_in, + dt_out, + out_desc->shape(), + in_desc->strides(), + out_desc->strides(), n, }); } diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu index 2ad20c203..8e7eea473 100644 --- a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -1,12 +1,12 @@ -#include "../cuda/kernel.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh" #include "../cast.h" -#include "cast_nvidia.cuh" +#include "../cuda/kernel.cuh" #include "../info.h" -#include +#include "cast_nvidia.cuh" #include -#include #include +#include +#include namespace op::cast::nvidia { @@ -18,8 +18,14 @@ Descriptor::~Descriptor() { delete _opaque; } -template struct MapCudaType { using Type = T; }; -template <> struct MapCudaType { using Type = half; }; +template +struct MapCudaType { + using Type = T; +}; +template <> +struct MapCudaType { + using Type = half; +}; infiniStatus_t Descriptor::create( infiniopHandle_t handle_, @@ -49,18 +55,18 @@ size_t Descriptor::workspaceSize() const 
{ template static inline infiniStatus_t cuda_cast_impl_incremental( - void *output_, const void *input_, - const op::cast::CastInfo &info, + void *output_, const void *input_, + const op::cast::CastInfo &info, void *stream_) { int bs = 256, grid = 0; cudaError_t propErr; int device_id_local = 0; using DevTout = typename MapCudaType::Type; - using DevTin = typename MapCudaType::Type; + using DevTin = typename MapCudaType::Type; auto out_dev = reinterpret_cast(output_); - auto in_dev = reinterpret_cast(input_); + auto in_dev = reinterpret_cast(input_); auto stream = reinterpret_cast(stream_); int ndim = static_cast(info.shape.size()); @@ -88,22 +94,38 @@ static inline infiniStatus_t cuda_cast_impl_incremental( cudaError_t err = cudaSuccess; err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } device_id_local = 0; propErr = cudaGetDevice(&device_id_local); @@ -112,24 +134,36 @@ static inline infiniStatus_t cuda_cast_impl_incremental( if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); } else { - if (bs > 256) bs = 256; + if (bs > 256) { + bs = 256; + } } } else { - if (bs > 256) bs = 256; + if (bs > 256) { + bs = 256; + } } - if (bs <= 0) bs = 256; + if (bs <= 0) { + bs = 256; + } grid = static_cast((info.n + bs - 1) / bs); - if (grid <= 0) grid = 1; + if (grid <= 0) { + grid = 1; + } cast_kernel<<>>( out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); err = cudaGetLastError(); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } cudaFree(d_shape); cudaFree(d_div); @@ -152,39 +186,39 @@ infiniStatus_t Descriptor::calculate( const void *input, void *stream) const { - if (output == const_cast(input)) { + if (output == const_cast(input)) { return INFINI_STATUS_BAD_PARAM; } - #define CASE_OUT(DT_OUT, TOUT) \ - case DT_OUT: { \ - switch (_info.dt_in) { \ - case INFINI_DTYPE_I32: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_I64: \ - 
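Note: a condensed view of the launch-geometry arithmetic used by cuda_cast_impl_incremental above (and repeated later for leakyrelu): clamp the block size against the device limit, then cover all n elements with a ceiling division. The helper below is illustrative and folds the device-query fallbacks into one parameter.

    #include <cstddef>

    // block: threads per block, grid: number of blocks, chosen so that
    // block * grid >= n while respecting the device's per-block limit.
    inline void launch_geometry(size_t n, int max_threads_per_block,
                                int &block, int &grid) {
        block = 256;
        if (max_threads_per_block > 0 && max_threads_per_block / 2 < block) {
            block = max_threads_per_block / 2;
        }
        if (block <= 0) {
            block = 256;
        }
        grid = static_cast<int>((n + static_cast<size_t>(block) - 1)
                                / static_cast<size_t>(block));
        if (grid <= 0) {
            grid = 1;
        }
    }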
cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_U32: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_U64: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_F16: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_F32: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_F64: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - default: \ - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ - } \ - break; \ - } +#define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } switch (_info.dt_out) { CASE_OUT(INFINI_DTYPE_I32, int32_t); @@ -198,7 +232,7 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CASE_OUT +#undef CASE_OUT return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc index fc3aef4ad..12d26953b 100644 --- a/src/infiniop/ops/cast/operator.cc +++ b/src/infiniop/ops/cast/operator.cc @@ -24,7 +24,7 @@ __C infiniStatus_t infiniopCreateCastDescriptor( handle, \ reinterpret_cast(desc_ptr), \ output_desc, \ - input_desc) \ + input_desc) switch (handle->device) { @@ -50,8 +50,8 @@ __C infiniStatus_t infiniopCreateCastDescriptor( __C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -84,8 +84,8 @@ __C infiniStatus_t infiniopCast( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, output, input, stream) @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopCast( __C infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h index 37efb7597..af324eb80 100644 --- a/src/infiniop/ops/cos/cpu/cos_cpu.h +++ b/src/infiniop/ops/cos/cpu/cos_cpu.h @@ -1,8 +1,8 @@ #ifndef __COS_CPU_H__ #define __COS_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(cos, cpu) diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh 
b/src/infiniop/ops/cos/cuda/kernel.cuh index 381a897f0..5db7ee8f6 100644 --- a/src/infiniop/ops/cos/cuda/kernel.cuh +++ b/src/infiniop/ops/cos/cuda/kernel.cuh @@ -1,48 +1,47 @@ #ifndef __COS_CUDA_H__ #define __COS_CUDA_H__ -#include -#include #include +#include +#include namespace op::cos::cuda { typedef struct CosOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &input) const { - auto cos_f32 = [] __device__ (float x) { - double xd = static_cast(x); - double yd = std::cos(xd); - return static_cast(yd); - }; + template + __device__ __forceinline__ T operator()(const T &input) const { + auto cos_f32 = [] __device__(float x) { + double xd = static_cast(x); + double yd = std::cos(xd); + return static_cast(yd); + }; - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2( - cos_f32(vf.x), - cos_f32(vf.y) - ); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = cos_f32(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rz(cos_f32(f0), cos_f32(f1)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - return __float2bfloat16_rz(cos_f32(xf)); - } else if constexpr (std::is_same_v) { - return cos_f32(input); - } else if constexpr (std::is_same_v) { - return std::cos(input); - } else { - return std::cos(input); + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + cos_f32(vf.x), + cos_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = cos_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rz(cos_f32(f0), cos_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(cos_f32(xf)); + } else if constexpr (std::is_same_v) { + return cos_f32(input); + } else if constexpr (std::is_same_v) { + return std::cos(input); + } else { + return std::cos(input); + } } - } } CosOp; } // namespace op::cos::cuda diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu index a3c38bc89..433363c91 100644 --- a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc index 11781d591..71a5f807c 100644 --- a/src/infiniop/ops/cos/operator.cc +++ b/src/infiniop/ops/cos/operator.cc @@ -18,13 +18,13 @@ __C infiniStatus_t infiniopCreateCosDescriptor( infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) \ 
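Note: every branch of the CosOp functor above widens to a common precision before calling the math function and then rounds back: half and bfloat16 lanes are converted to float, and the float path evaluates in double through the cos_f32 lambda. A host-side sketch of that float path, with an illustrative name:

    #include <cmath>

    // Evaluate cos for a float input in double precision, then narrow,
    // mirroring what the functor's cos_f32 lambda does on the device.
    inline float cos_f32_ref(float x) {
        return static_cast<float>(std::cos(static_cast<double>(x)));
    }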
+#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) switch (handle->device) { @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopCos( __C infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h index fbf9ab126..867c7afa5 100644 --- a/src/infiniop/ops/exp/cpu/exp_cpu.h +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -1,8 +1,8 @@ #ifndef __EXP_CPU_H__ #define __EXP_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(exp, cpu) diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh index 316a393be..12446f31a 100644 --- a/src/infiniop/ops/exp/cuda/kernel.cuh +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -1,39 +1,39 @@ #ifndef __EXP_CUDA_H__ #define __EXP_CUDA_H__ -#include -#include #include +#include +#include namespace op::exp::cuda { typedef struct ExpOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float inputf = __half2float(input); - return __float2half_rn(__expf(inputf)); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(__expf(f0), __expf(f1)); - } else if constexpr (std::is_same_v) { - float inputf = __bfloat162float(input); - return __float2bfloat16_rn(__expf(inputf)); - } else if constexpr (std::is_same_v) { - return __expf(input); - } else if constexpr (std::is_same_v) { - return std::exp(input); - } else { - return std::exp(input); + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__expf(f0), __expf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + return __expf(input); + } else if constexpr (std::is_same_v) { + return std::exp(input); + } else { + return std::exp(input); + } } - } } ExpOp; -} // namespace +} // namespace op::exp::cuda #endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu index f4229a942..3bdf2eb45 100644 --- a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = 
out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc index 56f5d29cd..ee1dc6768 100644 --- a/src/infiniop/ops/exp/operator.cc +++ b/src/infiniop/ops/exp/operator.cc @@ -18,13 +18,13 @@ __C infiniStatus_t infiniopCreateExpDescriptor( infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::exp::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) switch (handle->device) { @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopExp( __C infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h index a42009017..e137be8a0 100644 --- a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -1,8 +1,8 @@ #ifndef __HARDSWISH_CPU_H__ #define __HARDSWISH_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(hardswish, cpu) diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh index be22e5faa..d5b369bce 100644 --- a/src/infiniop/ops/hardswish/cuda/kernel.cuh +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -1,54 +1,53 @@ #ifndef __HARDSWISH_CUDA_H__ #define __HARDSWISH_CUDA_H__ -#include -#include #include +#include +#include namespace op::hardswish::cuda { typedef struct HardswishOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 - __device__ __forceinline__ float hswish_f32(float x) const { - float y = x + 3.0f; - y = y < 0.0f ? 0.0f : (y > 6.0f ? 6.0f : y); - return x * (y * (1.0f / 6.0f)); - } + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + } - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2( - hswish_f32(vf.x), - hswish_f32(vf.y) - ); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = hswish_f32(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - return __float2bfloat16_rz(hswish_f32(xf)); - } else if constexpr (std::is_same_v) { - return hswish_f32(input); - } else if constexpr (std::is_same_v) { - double xd = static_cast(input); - double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); - return static_cast(yd); - } else { - double xd = static_cast(input); - double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); - return static_cast(yd); + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } } - } } HardswishOp; } // namespace op::hardswish::cuda diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu index 0aff55cd2..9e279c2ef 100644 --- a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc index 7787c799b..e8ba19fc1 100644 --- a/src/infiniop/ops/hardswish/operator.cc +++ b/src/infiniop/ops/hardswish/operator.cc @@ -18,13 +18,13 @@ __C infiniStatus_t infiniopCreateHardswishDescriptor( infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::hardswish::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return 
op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) switch (handle->device) { @@ -50,8 +50,8 @@ __C infiniStatus_t infiniopCreateHardswishDescriptor( __C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -84,8 +84,8 @@ __C infiniStatus_t infiniopHardswish( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, output, {input}, stream) @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopHardswish( __C infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc index cd56f0ca6..c10a44cb5 100644 --- a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc @@ -40,7 +40,9 @@ static inline void cpu_leakyrelu_impl_incremental( const size_t ndim = info.shape.size(); const size_t n = info.n; - if (n == 0) return; + if (n == 0) { + return; + } auto out_base = reinterpret_cast(output); auto in_base = reinterpret_cast(input); @@ -62,15 +64,23 @@ static inline void cpu_leakyrelu_impl_incremental( *out_elem = utils::cast(outv); for (int d = static_cast(ndim) - 1; d >= 0; --d) { idx[d] += 1; - if (in_stride[d] != 0) in_off += in_stride[d]; - if (out_stride[d] != 0) out_off += out_stride[d]; + if (in_stride[d] != 0) { + in_off += in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += out_stride[d]; + } if (idx[d] < shape[d]) { break; } else { idx[d] = 0; - if (in_stride[d] != 0) in_off -= static_cast(shape[d]) * in_stride[d]; - if (out_stride[d] != 0) out_off -= static_cast(shape[d]) * out_stride[d]; + if (in_stride[d] != 0) { + in_off -= static_cast(shape[d]) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off -= static_cast(shape[d]) * out_stride[d]; + } } } } @@ -83,22 +93,22 @@ infiniStatus_t Descriptor::calculate( const void *input, void *stream) const { - switch (_info.dt_in) { + switch (_info.dt_in) { case INFINI_DTYPE_F16: cpu_leakyrelu_impl_incremental(output, input, _info); - break; + break; case INFINI_DTYPE_BF16: cpu_leakyrelu_impl_incremental(output, input, _info); - break; - case INFINI_DTYPE_F32: + break; + case INFINI_DTYPE_F32: cpu_leakyrelu_impl_incremental(output, input, _info); - break; - case INFINI_DTYPE_F64: + break; + case INFINI_DTYPE_F64: cpu_leakyrelu_impl_incremental(output, input, _info); - break; - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } return INFINI_STATUS_SUCCESS; } } // namespace op::leakyrelu::cpu diff --git a/src/infiniop/ops/leakyrelu/cuda/kernel.cuh b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh index abad71b6a..afca17002 100644 --- a/src/infiniop/ops/leakyrelu/cuda/kernel.cuh +++ b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh @@ -1,8 +1,8 @@ #ifndef __LEAKYRELU_CUDA_KERNEL_CUH__ #define __LEAKYRELU_CUDA_KERNEL_CUH__ -#include 
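Note: the leakyrelu descriptors in this patch carry a per-descriptor negative_slope (see the info.h and operator.cc hunks below). For orientation, the conventional LeakyReLU that such a parameter implies is sketched here; the exact device formula is not visible in this extract, so treat this as the textbook definition rather than a quote from the kernels.

    // Conventional LeakyReLU: identity for non-negative inputs, scaled by
    // negative_slope otherwise.
    inline float leaky_relu_ref(float x, float negative_slope) {
        return x >= 0.0f ? x : negative_slope * x;
    }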
#include +#include #include #include @@ -56,8 +56,12 @@ __global__ void leakyrelu_kernel( } else { idx_d = 0; } - if (in_stride[d] != 0) in_off += static_cast(idx_d) * in_stride[d]; - if (out_stride[d] != 0) out_off += static_cast(idx_d) * out_stride[d]; + if (in_stride[d] != 0) { + in_off += static_cast(idx_d) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += static_cast(idx_d) * out_stride[d]; + } } float v = to_float_for_leaky(in[static_cast(in_off)]); diff --git a/src/infiniop/ops/leakyrelu/info.h b/src/infiniop/ops/leakyrelu/info.h index dd0a2d3ad..1f074d85a 100644 --- a/src/infiniop/ops/leakyrelu/info.h +++ b/src/infiniop/ops/leakyrelu/info.h @@ -34,7 +34,9 @@ class LeakyReLUInfo { } size_t n = 1; - for (size_t i = 0; i < in_desc->ndim(); ++i) n *= static_cast(in_desc->dim(i)); + for (size_t i = 0; i < in_desc->ndim(); ++i) { + n *= static_cast(in_desc->dim(i)); + } return utils::Result(LeakyReLUInfo{ dt_in, @@ -42,8 +44,7 @@ class LeakyReLUInfo { in_desc->strides(), out_desc->strides(), n, - negative_slope - }); + negative_slope}); } }; diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu index 05d149d5e..9b65bc421 100644 --- a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu @@ -1,13 +1,13 @@ -#include "../cuda/kernel.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" #include "../leakyrelu.h" #include "leakyrelu_nvidia.cuh" -#include "../info.h" -#include #include -#include #include +#include +#include namespace op::leakyrelu::nvidia { @@ -19,10 +19,19 @@ Descriptor::~Descriptor() { delete _opaque; } -template struct MapCudaType { using Type = T; }; -template <> struct MapCudaType { using Type = half; }; +template +struct MapCudaType { + using Type = T; +}; +template <> +struct MapCudaType { + using Type = half; +}; #if defined(__CUDA_BF16_TYPES_EXIST__) || defined(__CUDA_ARCH__) -template <> struct MapCudaType { using Type = __nv_bfloat16; }; +template <> +struct MapCudaType { + using Type = __nv_bfloat16; +}; #endif infiniStatus_t Descriptor::create( @@ -54,8 +63,8 @@ size_t Descriptor::workspaceSize() const { template static inline infiniStatus_t cuda_leakyrelu_impl_incremental( - void *output_, const void *input_, - const op::leakyrelu::LeakyReLUInfo &info, + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, void *stream_) { int bs = 256, grid = 0; @@ -64,7 +73,7 @@ static inline infiniStatus_t cuda_leakyrelu_impl_incremental( using DevT = typename MapCudaType::Type; auto out_dev = reinterpret_cast(output_); - auto in_dev = reinterpret_cast(input_); + auto in_dev = reinterpret_cast(input_); auto stream = reinterpret_cast(stream_); int ndim = static_cast(info.shape.size()); @@ -93,21 +102,37 @@ static inline infiniStatus_t cuda_leakyrelu_impl_incremental( cudaError_t err = cudaSuccess; err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = 
cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } device_id_local = 0; propErr = cudaGetDevice(&device_id_local); @@ -116,24 +141,36 @@ static inline infiniStatus_t cuda_leakyrelu_impl_incremental( if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); } else { - if (bs > 256) bs = 256; + if (bs > 256) { + bs = 256; + } } } else { - if (bs > 256) bs = 256; + if (bs > 256) { + bs = 256; + } } - if (bs <= 0) bs = 256; + if (bs <= 0) { + bs = 256; + } grid = static_cast((info.n + bs - 1) / bs); - if (grid <= 0) grid = 1; + if (grid <= 0) { + grid = 1; + } leakyrelu_kernel<<>>( out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); err = cudaGetLastError(); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } cudaFree(d_shape); cudaFree(d_div); @@ -156,22 +193,22 @@ infiniStatus_t Descriptor::calculate( const void *input, void *stream) const { - switch (_info.dt_in) { + switch (_info.dt_in) { case INFINI_DTYPE_F16: cuda_leakyrelu_impl_incremental(output, input, _info, stream); - break; + break; case INFINI_DTYPE_BF16: cuda_leakyrelu_impl_incremental(output, input, _info, stream); - break; - case INFINI_DTYPE_F32: + break; + case INFINI_DTYPE_F32: cuda_leakyrelu_impl_incremental(output, input, _info, stream); - break; - case INFINI_DTYPE_F64: + break; + case INFINI_DTYPE_F64: cuda_leakyrelu_impl_incremental(output, input, _info, stream); - break; - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/leakyrelu/operator.cc b/src/infiniop/ops/leakyrelu/operator.cc index ad6d504a8..3f78a4916 100644 --- a/src/infiniop/ops/leakyrelu/operator.cc +++ b/src/infiniop/ops/leakyrelu/operator.cc @@ -19,13 +19,13 @@ __C infiniStatus_t infiniopCreateLeakyreluDescriptor( infiniopTensorDescriptor_t x_desc, float negative_slope) { -#define CREATE_LEAKY(CASE, NAMESPACE) \ - case CASE: \ - return op::leakyrelu::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ +#define CREATE_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return op::leakyrelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ negative_slope) switch (handle->device) { @@ -58,8 +58,8 @@ __C infiniStatus_t infiniopCreateLeakyreluDescriptor( __C infiniStatus_t 
infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -95,8 +95,8 @@ __C infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor __C infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, void *workspace, size_t workspace_size, void *y, const void *x, void *stream) { -#define CALC_LEAKY(CASE, NAMESPACE) \ - case CASE: \ +#define CALC_LEAKY(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, y, x, stream) @@ -130,9 +130,9 @@ __C infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, void *w __C infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc) { -#define DESTROY_LEAKY(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DESTROY_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h index b2f87c2ea..32537ef17 100644 --- a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h @@ -11,9 +11,8 @@ typedef struct SigmoidBackwardOp { static constexpr size_t num_inputs = 2; template T operator()(const T &x, const T &grad_out) const { - using ComputeT = - std::conditional_t || std::is_same_v, - float, T>; + using ComputeT = std::conditional_t || std::is_same_v, + float, T>; ComputeT xv = utils::cast(x); ComputeT gov = utils::cast(grad_out); diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh index 6c10dd26e..42c850004 100644 --- a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh +++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh @@ -1,10 +1,10 @@ #ifndef __SIGMOID_BACKWARD_CUDA_H__ #define __SIGMOID_BACKWARD_CUDA_H__ -#include -#include -#include #include +#include +#include +#include namespace op::sigmoid_backward::cuda { typedef struct SigmoidBackwardOp { diff --git a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc index f30a646d0..40a279f4b 100644 --- a/src/infiniop/ops/sigmoid_backward/operator.cc +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -19,13 +19,13 @@ __C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t grad_output_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - grad_input_desc, \ - {input_desc, \ + grad_input_desc, \ + {input_desc, \ grad_output_desc}) switch (handle->device) { @@ -52,8 +52,8 @@ __C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( __C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -87,8 +87,8 @@ __C infiniStatus_t infiniopSigmoidBackward( const void *grad_output, void *stream) { -#define CALCULATE(CASE, 
NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) @@ -117,8 +117,8 @@ __C infiniStatus_t infiniopSigmoidBackward( __C infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h index e221c2573..80e406f98 100644 --- a/src/infiniop/ops/sin/cpu/sin_cpu.h +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -1,8 +1,8 @@ #ifndef __SIN_CPU_H__ #define __SIN_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(sin, cpu) diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh index c9993ca12..30641366c 100644 --- a/src/infiniop/ops/sin/cuda/kernel.cuh +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -1,38 +1,38 @@ #ifndef __SIN_CUDA_H__ #define __SIN_CUDA_H__ -#include -#include #include +#include +#include namespace op::sin::cuda { typedef struct SinOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float inputf = __half2float(input); - return __float2half_rn(sinf(inputf)); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(__sinf(f0), __sinf(f1)); - } else if constexpr (std::is_same_v) { - float inputf = __bfloat162float(input); - return __float2bfloat16_rn(__sinf(inputf)); - } else if constexpr (std::is_same_v) { - return sinf(input); - } else if constexpr (std::is_same_v) { - return std::sin(input); - } else { - return std::sin(input); + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(sinf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__sinf(f0), __sinf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__sinf(inputf)); + } else if constexpr (std::is_same_v) { + return sinf(input); + } else if constexpr (std::is_same_v) { + return std::sin(input); + } else { + return std::sin(input); + } } - } } SinOp; } // namespace op::sin::cuda diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu index eaac7a582..6fbf952bc 100644 --- a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, 
INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc index 38d8b242c..978561a04 100644 --- a/src/infiniop/ops/sin/operator.cc +++ b/src/infiniop/ops/sin/operator.cc @@ -18,13 +18,13 @@ __C infiniStatus_t infiniopCreateSinDescriptor( infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sin::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) switch (handle->device) { @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopSin( __C infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h index 5dc73b383..73fd7c1b6 100644 --- a/src/infiniop/ops/tanh/cpu/tanh_cpu.h +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -1,8 +1,8 @@ #ifndef __TANH_CPU_H__ #define __TANH_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(tanh, cpu) diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh index 49605aa93..62979a20e 100644 --- a/src/infiniop/ops/tanh/cuda/kernel.cuh +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -1,45 +1,45 @@ #ifndef __TANH_CUDA_H__ #define __TANH_CUDA_H__ -#include -#include #include +#include +#include namespace op::tanh::cuda { typedef struct TanhOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - __device__ __forceinline__ float tanh_f32_func(float x) const { - return tanhf(x); - } - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = tanh_f32_func(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - float r0 = tanh_f32_func(f0); - float r1 = tanh_f32_func(f1); - return __floats2bfloat162_rn(r0, r1); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - float rf = tanh_f32_func(xf); - return __float2bfloat16_rn(rf); - } else if constexpr (std::is_same_v) { - return tanh_f32_func(input); - } else if constexpr (std::is_same_v) { - return std::tanh(input); - } else { - return std::tanh(input); + __device__ __forceinline__ float tanh_f32_func(float x) const { + return tanhf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = tanh_f32_func(xf); + return 
__float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = tanh_f32_func(f0); + float r1 = tanh_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = tanh_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return tanh_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::tanh(input); + } else { + return std::tanh(input); + } } - } } TanhOp; } // namespace op::tanh::cuda diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu index eeb6c85bf..a2c36551c 100644 --- a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc index a5ed56f74..d34d97df6 100644 --- a/src/infiniop/ops/tanh/operator.cc +++ b/src/infiniop/ops/tanh/operator.cc @@ -20,11 +20,11 @@ __C infiniStatus_t infiniopCreateTanhDescriptor( #define CREATE(CASE, NAMESPACE) \ case CASE: \ - return op::tanh::NAMESPACE::Descriptor::create( \ + return op::tanh::NAMESPACE::Descriptor::create( \ handle, \ - reinterpret_cast(desc_ptr), \ + reinterpret_cast(desc_ptr), \ output_desc, \ - {input_desc}) \ + {input_desc}) switch (handle->device) { @@ -50,8 +50,8 @@ __C infiniStatus_t infiniopCreateTanhDescriptor( __C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -84,8 +84,8 @@ __C infiniStatus_t infiniopTanh( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, output, {input}, stream) @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopTanh( __C infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc new file mode 100644 index 000000000..de7e86e3e --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.cc @@ -0,0 +1,84 @@ +#include "where_cpu.h" + +namespace op::where::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); 
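Note: the WhereOp functors introduced for the CPU and CUDA backends below reduce to an element-wise select, out[i] = cond[i] ? a[i] : b[i], over four tensors whose shapes must match (CHECK_SAME_SHAPE in create() below). A host reference sketch with illustrative types:

    #include <cstddef>

    // Element-wise select matching the WhereOp semantics: take a[i] where
    // the condition is non-zero, otherwise b[i].
    void where_ref(float *out, const float *a, const float *b,
                   const unsigned char *cond, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            out[i] = cond[i] ? a[i] : b[i];
        }
    }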
+ const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::cpu diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h new file mode 100644 index 000000000..02ccab234 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -0,0 +1,19 @@ +#ifndef __WHERE_CPU_H__ +#define __WHERE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(where, cpu) + +namespace op::where::cpu { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + T operator()(const T &a, const T &b, const T &cond) const { + return cond ? a : b; + } +} WhereOp; +} // namespace op::where::cpu + +#endif // __WHERE_CPU_H__ diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh new file mode 100644 index 000000000..58e370aa4 --- /dev/null +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __WHERE_CUDA_H__ +#define __WHERE_CUDA_H__ + +namespace op::where::cuda { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + __device__ __forceinline__ T operator()(const T &a, const T &b, const T &cond) const { + return cond ? 
a : b; + } +} WhereOp; +} // namespace op::where::cuda + +#endif // __WHERE_CUDA_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h new file mode 100644 index 000000000..43bb1a945 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.h @@ -0,0 +1,8 @@ +#ifndef __WHERE_METAX_API_H__ +#define __WHERE_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(where, metax) + +#endif // __WHERE_METAX_API_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca new file mode 100644 index 000000000..fb4be9325 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -0,0 +1,62 @@ +#include "where_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::where::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::metax diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu new file mode 100644 index 000000000..860089bd2 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu @@ -0,0 +1,91 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "where_nvidia.cuh" + +namespace op::where::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = 
b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::nvidia diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh new file mode 100644 index 000000000..c168364a8 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __WHERE_CUDA_API_H__ +#define __WHERE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(where, nvidia) + +#endif // __WHERE_CUDA_API_H__ diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc new file mode 100644 index 000000000..d69b1d4e1 --- /dev/null +++ b/src/infiniop/ops/where/operator.cc @@ -0,0 +1,148 @@ +#include "../../operator.h" 
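Note: the where/operator.cc file added below exposes four C entry points. A hypothetical caller sketch is given here for orientation; the handle, tensor descriptors and device buffers (handle, a_desc, b_desc, cond_desc, c_desc, a, b, cond, c) are assumed to exist already, status codes are not checked, and the create / query-workspace / compute / destroy order is the conventional sequence assumed by this sketch, not something spelled out in the patch.

    // Create a descriptor, size its workspace, run the operator on a
    // stream, then destroy the descriptor.
    infiniopWhereDescriptor_t desc = nullptr;
    infiniopCreateWhereDescriptor(handle, &desc, c_desc, a_desc, b_desc, cond_desc);

    size_t workspace_size = 0;
    infiniopGetWhereWorkspaceSize(desc, &workspace_size);
    void *workspace = nullptr; // allocate workspace_size bytes on the target device

    infiniopWhere(desc, workspace, workspace_size, c, a, b, cond, /*stream=*/nullptr);
    infiniopDestroyWhereDescriptor(desc);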
+#include "../../handle.h" +#include "infiniop/ops/where.h" + +#ifdef ENABLE_CPU_API +#include "cpu/where_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/where_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/where_metax.h" +#endif + +__C infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t condition_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc, \ + condition_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b, condition}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..c940d4f05 --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,288 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + 
get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map, +) +from enum import Enum, auto + +# ====================================================================== +# Configuration (Internal Use Only) +# Now each test case tuple is: (shape, a_stride, b_stride, cond_stride, c_stride) +# ====================================================================== +_TEST_CASES_ = [ + ((13, 4), None, None, None, None), + ((13, 4), None, None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None, None), + ((13, 4, 4), None, None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None, None), + ((16, 5632), None, None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_COND = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, + Inplace.INPLACE_COND, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TENSOR_DTYPES = _INTEGER_DTYPES + _FLOAT_DTYPES + +_TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def make_integer_torch_tensor(shape, inf_dt, device): + use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + +def where_ref(c, a, b, cond): + cond_bool = cond.torch_tensor().to(torch.bool) + c.torch_tensor().copy_(torch.where(cond_bool, a.torch_tensor(), b.torch_tensor())) + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + 
cond_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + inf_dt = dtype + + if not is_supported_dt(inf_dt): + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on this platform") + return + + try: + if _is_integer_dtype(inf_dt): + a_torch = make_integer_torch_tensor(shape, inf_dt, device) + b_torch = make_integer_torch_tensor(shape, inf_dt, device) + a = TestTensor.from_torch(a_torch, inf_dt, device) + b = TestTensor.from_torch(b_torch, inf_dt, device) + else: + a = TestTensor(shape, a_stride, inf_dt, device, mode="random") + b = TestTensor(shape, b_stride, inf_dt, device, mode="random") + except RuntimeError as e: + msg = str(e) + if "not implemented for 'UInt32'" in msg or "not implemented for 'UInt64'" in msg or "check_uniform_bounds" in msg: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} because platform torch can't build random tensor: {e}") + return + else: + raise + + dev = torch_device_map[device] + if _is_integer_dtype(inf_dt): + cond_torch = torch.randint(0, 2, size=shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + else: + cond_bool = (torch.rand(shape, device=dev) > 0.5) + cond_torch = cond_bool.to(dtype=to_torch_dtype(inf_dt, compatability_mode=False)) + + cond = TestTensor.from_torch(cond_torch, inf_dt, device) + + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + elif inplace == Inplace.INPLACE_COND: + if c_stride != cond_stride: + return + c = cond + else: + if _is_integer_dtype(inf_dt): + dev = torch_device_map[device] + c_torch = torch.zeros(shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + c = TestTensor.from_torch(c_torch, inf_dt, device) + else: + c = TestTensor(shape, c_stride, inf_dt, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} " + f"shape:{shape} a_stride:{a_stride} b_stride:{b_stride} cond_stride:{cond_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[inf_dt]} inplace:{inplace}" + ) + + where_ref(c, a, b, cond) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + try: + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + cond.descriptor, + ) + ) + except Exception as e: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on {InfiniDeviceNames[device]}: CreateWhereDescriptor failed: {e}") + return + + for tensor in [a, b, c, cond]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_where(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + cond.data(), + None, + ) + ) + + lib_where() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, inf_dt) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: where_ref(c, a, b, cond), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_where(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + 
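# Editor's illustrative sketch (not part of the patch series): where_ref above casts
# the condition tensor to torch.bool because torch.where expects a boolean condition,
# while the infiniop descriptor accepts a condition of any supported dtype; the test
# always fills the condition with 0/1 values, so the cast is lossless. A minimal
# standalone version of the reference computation, assuming plain torch tensors in
# place of the TestTensor wrappers (the helper name below is hypothetical):
def _where_reference_sketch(shape=(13, 4)):
    a = torch.randn(shape)
    b = torch.randn(shape)
    # 0/1 integer condition, mirroring how the test builds `cond` for integer dtypes
    cond = torch.randint(0, 2, shape, dtype=torch.int32)
    # torch.where rejects a non-boolean condition, so convert it first
    return torch.where(cond.to(torch.bool), a, b)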
+def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + supported = [dt for dt in _TENSOR_DTYPES if is_supported_dt(dt)] + devices = get_test_devices(args) + + for device in devices: + test_operator(device, test, _TEST_CASES, supported) + + print("\033[92mTest passed!\033[0m") + + +if __name__ == "__main__": + main() From a126be00f80873ee02b11db78663aa96e8cc3720 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Mon, 18 Aug 2025 00:55:58 +0800 Subject: [PATCH 11/16] [T1-1-1]: operators clang-format --- include/infiniop.h | 14 ++++---- include/infiniop/ops/cast.h | 16 ++++----- include/infiniop/ops/hardswish.h | 16 ++++----- include/infiniop/ops/leakyrelu.h | 18 +++++----- include/infiniop/ops/sigmoid_backward.h | 20 +++++------ include/infiniop/ops/tanh.h | 16 ++++----- include/infiniop/ops/where.h | 24 +++++++------- src/infiniop-test/include/ops.hpp | 44 ++++++++++++------------- 8 files changed, 84 insertions(+), 84 deletions(-) diff --git a/include/infiniop.h b/include/infiniop.h index c86127cb2..30a07e4b4 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -4,26 +4,26 @@ #include "infiniop/handle.h" #include "infiniop/ops/add.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/cast.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/exp.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/leakyrelu.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/sigmoid_backward.h" +#include "infiniop/ops/sin.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" -#include "infiniop/ops/exp.h" -#include "infiniop/ops/sin.h" -#include "infiniop/ops/cos.h" -#include "infiniop/ops/leakyrelu.h" #include "infiniop/ops/tanh.h" -#include "infiniop/ops/sigmoid_backward.h" -#include "infiniop/ops/hardswish.h" -#include "infiniop/ops/cast.h" #include "infiniop/ops/where.h" #include "infiniop/tensor_descriptor.h" diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h index 82b41490e..81d771efe 100644 --- a/include/infiniop/ops/cast.h +++ b/include/infiniop/ops/cast.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopCastDescriptor_t; __C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle, - infiniopCastDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); __C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc); diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h index 79a7c93ea..8d655fe82 100644 --- a/include/infiniop/ops/hardswish.h +++ 
b/include/infiniop/ops/hardswish.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; __C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, - infiniopHardswishDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); __C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); diff --git a/include/infiniop/ops/leakyrelu.h b/include/infiniop/ops/leakyrelu.h index 9ce93d53c..adc46d1c6 100644 --- a/include/infiniop/ops/leakyrelu.h +++ b/include/infiniop/ops/leakyrelu.h @@ -6,19 +6,19 @@ typedef struct InfiniopDescriptor *infiniopLeakyreluDescriptor_t; __C __export infiniStatus_t infiniopCreateLeakyreluDescriptor(infiniopHandle_t handle, - infiniopLeakyreluDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input, - float negative_slope); + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float negative_slope); __C __export infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __C __export infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc); diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h index 2bcc5dee6..abab0cde7 100644 --- a/include/infiniop/ops/sigmoid_backward.h +++ b/include/infiniop/ops/sigmoid_backward.h @@ -6,20 +6,20 @@ typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; __C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, - infiniopSigmoidBackwardDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t grad_input, - infiniopTensorDescriptor_t input, - infiniopTensorDescriptor_t grad_output); + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t grad_output); __C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *grad_input, - const void *input, - const void* grad_output, - void *stream); + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); __C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h index 62974e951..742dba860 100644 --- a/include/infiniop/ops/tanh.h +++ 
b/include/infiniop/ops/tanh.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; __C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, - infiniopTanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); __C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h index a328c312a..713db102f 100644 --- a/include/infiniop/ops/where.h +++ b/include/infiniop/ops/where.h @@ -6,22 +6,22 @@ typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; __C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, - infiniopWhereDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b, - infiniopTensorDescriptor_t condition); + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + infiniopTensorDescriptor_t condition); __C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - const void *condition, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); __C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 4c16eeec7..c3a120703 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -39,28 +39,28 @@ DECLARE_INFINIOP_TEST(where) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ - REGISTER_INFINIOP_TEST(exp) \ - REGISTER_INFINIOP_TEST(sin) \ - REGISTER_INFINIOP_TEST(cos) \ - REGISTER_INFINIOP_TEST(leakyrelu) \ - REGISTER_INFINIOP_TEST(tanh) \ - REGISTER_INFINIOP_TEST(sigmoid_backward)\ - REGISTER_INFINIOP_TEST(hardswish) \ - REGISTER_INFINIOP_TEST(cast) \ - REGISTER_INFINIOP_TEST(where) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) 
\ + REGISTER_INFINIOP_TEST(exp) \ + REGISTER_INFINIOP_TEST(sin) \ + REGISTER_INFINIOP_TEST(cos) \ + REGISTER_INFINIOP_TEST(leakyrelu) \ + REGISTER_INFINIOP_TEST(tanh) \ + REGISTER_INFINIOP_TEST(sigmoid_backward) \ + REGISTER_INFINIOP_TEST(hardswish) \ + REGISTER_INFINIOP_TEST(cast) \ + REGISTER_INFINIOP_TEST(where) \ } namespace infiniop_test { From 92b15d0a2fe0a5e1920b62fbebe31683d8fd8a26 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 24 Aug 2025 18:07:53 +0800 Subject: [PATCH 12/16] [T1-1-1]: Modify where operator condition with T->bool --- src/infiniop/ops/where/cpu/where_cpu.h | 2 +- src/infiniop/ops/where/cuda/kernel.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h index 02ccab234..3d86cb4f7 100644 --- a/src/infiniop/ops/where/cpu/where_cpu.h +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -10,7 +10,7 @@ typedef struct WhereOp { public: static constexpr size_t num_inputs = 3; template - T operator()(const T &a, const T &b, const T &cond) const { + T operator()(const T &a, const T &b, const bool &cond) const { return cond ? a : b; } } WhereOp; diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh index 58e370aa4..8eb5c762b 100644 --- a/src/infiniop/ops/where/cuda/kernel.cuh +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -6,7 +6,7 @@ typedef struct WhereOp { public: static constexpr size_t num_inputs = 3; template - __device__ __forceinline__ T operator()(const T &a, const T &b, const T &cond) const { + __device__ __forceinline__ T operator()(const T &a, const T &b, const bool &cond) const { return cond ? a : b; } } WhereOp; From 73a6994cde5f6db360cb4962f6eae1d5201c9d4b Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Wed, 27 Aug 2025 16:45:32 +0800 Subject: [PATCH 13/16] [T1-1-1]: Modify leakyrelu operator profile test --- test/infiniop/leakyrelu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/infiniop/leakyrelu.py b/test/infiniop/leakyrelu.py index 93a8170d2..76562ddf0 100644 --- a/test/infiniop/leakyrelu.py +++ b/test/infiniop/leakyrelu.py @@ -147,7 +147,7 @@ def lib_leakyrelu(): # Profiling workflow if PROFILE: # fmt: off - profile_operation("PyTorch", lambda: leakyrelu(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation("PyTorch", lambda: leakyrelu(output.torch_tensor(), input.torch_tensor(), negative_slope), device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_leakyrelu(), device, NUM_PRERUN, NUM_ITERATIONS) # fmt: on check_error(LIBINFINIOP.infiniopDestroyLeakyreluDescriptor(descriptor)) From 52f6d162fb9f98ec3abf10767809361a1dbdc76d Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Wed, 27 Aug 2025 20:30:01 +0800 Subject: [PATCH 14/16] [T1-1-1]: Modify where operator metax --- src/infiniop/ops/where/metax/where_metax.maca | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca index fb4be9325..46c47e541 100644 --- a/src/infiniop/ops/where/metax/where_metax.maca +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -19,16 +19,28 @@ infiniStatus_t Descriptor::create( const auto &a_desc = input_desc_vec.at(0); const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + const auto &c_shape = out_desc->shape(); const auto &a_shape = 
a_desc->shape(); const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) return INFINI_STATUS_SUCCESS; } @@ -53,6 +65,24 @@ infiniStatus_t Descriptor::calculate( return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } From 394fffb29b668f7a91e9a5b1f1818a492487132b Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Wed, 27 Aug 2025 20:31:37 +0800 Subject: [PATCH 15/16] [T1-1-1]: Modify tanh operator cpp --- src/infiniop-test/src/ops/tanh.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp index 6aeb3c301..6f966de09 100644 --- a/src/infiniop-test/src/ops/tanh.cpp +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -37,8 +37,8 @@ std::shared_ptr Test::build( test->_atol = 1e-3; } if (elemType == GGML_TYPE_F32) { - test->_rtol = 1e-7; - test->_atol = 1e-7; + test->_rtol = 1e-6; + test->_atol = 1e-6; } return test; From 10817ced9d0a9c0a6b2b0bffce8257a91f40e35b Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Wed, 27 Aug 2025 20:50:44 +0800 Subject: [PATCH 16/16] [T1-1-1]: Modify where operator metax --- src/infiniop/ops/where/metax/where_metax.maca | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca index 46c47e541..b648cfbcc 100644 --- a/src/infiniop/ops/where/metax/where_metax.maca +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -40,7 +40,7 @@ infiniStatus_t Descriptor::create( CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) return INFINI_STATUS_SUCCESS; }
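Usage recap (editor's sketch, not part of the patches): the Where operator introduced in this series follows the same create / query-workspace / compute / destroy lifecycle as the other elementwise operators, and test/infiniop/where.py exercises exactly these calls through the ctypes bindings registered in op_register.py. The sketch below condenses that lifecycle for a single out-of-place float32 case. It assumes the handle and device supplied by the existing test harness (get_test_devices / test_operator) and the TestTensor / TestWorkspace helpers; the function name run_where_once is hypothetical.

import ctypes
from ctypes import c_uint64

import torch
from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    TestWorkspace,
    InfiniDtype,
    check_error,
    infiniopOperatorDescriptor_t,
    torch_device_map,
)


def run_where_once(handle, device, shape=(13, 4), dtype=InfiniDtype.F32):
    # Build inputs the same way test/infiniop/where.py does for float dtypes:
    # random a/b, a 0/1 condition, and an output tensor to overwrite.
    a = TestTensor(shape, None, dtype, device, mode="random")
    b = TestTensor(shape, None, dtype, device, mode="random")
    cond_torch = (torch.rand(shape, device=torch_device_map[device]) > 0.5).to(torch.float32)
    cond = TestTensor.from_torch(cond_torch, dtype, device)
    c = TestTensor(shape, None, dtype, device, mode="ones")

    # 1) Create the descriptor: output first, then a, b, condition.
    desc = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateWhereDescriptor(
            handle, ctypes.byref(desc),
            c.descriptor, a.descriptor, b.descriptor, cond.descriptor,
        )
    )

    # 2) Query and allocate the workspace.
    size = c_uint64(0)
    check_error(LIBINFINIOP.infiniopGetWhereWorkspaceSize(desc, ctypes.byref(size)))
    workspace = TestWorkspace(size.value, c.device)

    # 3) Run the operator (last argument is the stream; None = default).
    check_error(
        LIBINFINIOP.infiniopWhere(
            desc,
            workspace.data(), workspace.size(),
            c.data(), a.data(), b.data(), cond.data(),
            None,
        )
    )

    # 4) Release the descriptor.
    check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(desc))
    return c

Keeping the condition to explicit 0/1 values mirrors the test and, with the later WhereOp change that reads the condition element as bool, is the least ambiguous contract for callers.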