diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..df8ecc896 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -7,13 +7,22 @@ #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/crossentropyloss_backward.h" +#include "infiniop/ops/div.h" +#include "infiniop/ops/gelu.h" +#include "infiniop/ops/gelu_backward.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/logical_and.h" +#include "infiniop/ops/logical_equal.h" +#include "infiniop/ops/logical_or.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/relu.h" +#include "infiniop/ops/relu_backward.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/silu.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" diff --git a/include/infiniop/ops/crossentropyloss_backward.h b/include/infiniop/ops/crossentropyloss_backward.h new file mode 100644 index 000000000..87969bc87 --- /dev/null +++ b/include/infiniop/ops/crossentropyloss_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_CROSSENTROPYLOSS_BACKWARD_API_H__ +#define __INFINIOP_CROSSENTROPYLOSS_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyLossBackWardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCrossEntropyLossBackWardDescriptor(infiniopHandle_t handle, + infiniopCrossEntropyLossBackWardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_logits, + infiniopTensorDescriptor_t probs, + infiniopTensorDescriptor_t target); + +__C __export infiniStatus_t infiniopGetCrossEntropyLossBackWardWorkspaceSize(infiniopCrossEntropyLossBackWardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCrossEntropyLossBackWard(infiniopCrossEntropyLossBackWardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_logits, + const void *probs, + const void *target, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCrossEntropyLossBackWardDescriptor(infiniopCrossEntropyLossBackWardDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h new file mode 100644 index 000000000..235a34a15 --- /dev/null +++ b/include/infiniop/ops/div.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *mode, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gelu.h b/include/infiniop/ops/gelu.h new file mode 100644 index 000000000..061bab14d --- /dev/null +++ b/include/infiniop/ops/gelu.h @@ -0,0 +1,22 @@ +#ifndef __INFINIOP_GELU_API_H__ +#define __INFINIOP_GELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGeluDescriptor_t; + +__C 
__export infiniStatus_t infiniopCreateGeluDescriptor(infiniopHandle_t handle,
+                                                     infiniopGeluDescriptor_t *desc_ptr,
+                                                     infiniopTensorDescriptor_t output,
+                                                     infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGelu(infiniopGeluDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/gelu_backward.h b/include/infiniop/ops/gelu_backward.h
new file mode 100644
index 000000000..2cb7cbf46
--- /dev/null
+++ b/include/infiniop/ops/gelu_backward.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_GELU_BACKWARD_API_H__
+#define __INFINIOP_GELU_BACKWARD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopGeluBackWardDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateGeluBackWardDescriptor(infiniopHandle_t handle,
+                                                                 infiniopGeluBackWardDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t grad_input,
+                                                                 infiniopTensorDescriptor_t input,
+                                                                 infiniopTensorDescriptor_t grad_output);
+
+__C __export infiniStatus_t infiniopGeluBackWard(infiniopGeluBackWardDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *grad_input,
+                                                 const void *input,
+                                                 const void *grad_output,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroyGeluBackWardDescriptor(infiniopGeluBackWardDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/logical_and.h b/include/infiniop/ops/logical_and.h
new file mode 100644
index 000000000..37cbf04d4
--- /dev/null
+++ b/include/infiniop/ops/logical_and.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_AND_API_H__
+#define __INFINIOP_AND_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopLogicalAndDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateLogicalAndDescriptor(infiniopHandle_t handle,
+                                                               infiniopLogicalAndDescriptor_t *desc_ptr,
+                                                               infiniopTensorDescriptor_t c,
+                                                               infiniopTensorDescriptor_t a,
+                                                               infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopLogicalAnd(infiniopLogicalAndDescriptor_t desc,
+                                               void *workspace,
+                                               size_t workspace_size,
+                                               void *c,
+                                               const void *a,
+                                               const void *b,
+                                               void *stream);
+
+__C __export infiniStatus_t infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc);
+#endif
diff --git a/include/infiniop/ops/logical_equal.h b/include/infiniop/ops/logical_equal.h
new file mode 100644
index 000000000..e952f1dbc
--- /dev/null
+++ b/include/infiniop/ops/logical_equal.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_EQUAL_API_H__
+#define __INFINIOP_EQUAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopLogicalEqualDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateLogicalEqualDescriptor(infiniopHandle_t handle,
+                                                                 infiniopLogicalEqualDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t c,
+                                                                 infiniopTensorDescriptor_t a,
+                                                                 infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetLogicalEqualWorkspaceSize(infiniopLogicalEqualDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopLogicalEqual(infiniopLogicalEqualDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *c,
+                                                 const void *a,
+                                                 const void *b,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroyLogicalEqualDescriptor(infiniopLogicalEqualDescriptor_t desc);
+#endif
diff --git a/include/infiniop/ops/logical_or.h b/include/infiniop/ops/logical_or.h
new file mode 100644
index 000000000..efe19a1c1
--- /dev/null
+++ b/include/infiniop/ops/logical_or.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_OR_API_H__
+#define __INFINIOP_OR_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopLogicalOrDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateLogicalOrDescriptor(infiniopHandle_t handle,
+                                                              infiniopLogicalOrDescriptor_t *desc_ptr,
+                                                              infiniopTensorDescriptor_t c,
+                                                              infiniopTensorDescriptor_t a,
+                                                              infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopLogicalOr(infiniopLogicalOrDescriptor_t desc,
+                                              void *workspace,
+                                              size_t workspace_size,
+                                              void *c,
+                                              const void *a,
+                                              const void *b,
+                                              void *stream);
+
+__C __export infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc);
+#endif
diff --git a/include/infiniop/ops/relu_backward.h b/include/infiniop/ops/relu_backward.h
new file mode 100644
index 000000000..3802da6b4
--- /dev/null
+++ b/include/infiniop/ops/relu_backward.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_RELU_BACKWARD_API_H__
+#define __INFINIOP_RELU_BACKWARD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopReluBackWardDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateReluBackWardDescriptor(infiniopHandle_t handle,
+                                                                 infiniopReluBackWardDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t grad_input,
+                                                                 infiniopTensorDescriptor_t input,
+                                                                 infiniopTensorDescriptor_t grad_output);
+
+__C __export infiniStatus_t infiniopGetReluBackWardWorkspaceSize(infiniopReluBackWardDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopReluBackWard(infiniopReluBackWardDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *grad_input,
+                                                 const void *input,
+                                                 const void *grad_output,
+                                                 void *stream);
+
+__C __export infiniStatus_t infiniopDestroyReluBackWardDescriptor(infiniopReluBackWardDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/silu.h b/include/infiniop/ops/silu.h
new file mode 100644
index 000000000..66a82ad0d
--- /dev/null
+++ b/include/infiniop/ops/silu.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_SILU_API_H__
+#define __INFINIOP_SILU_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopSiluDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateSiluDescriptor(infiniopHandle_t handle,
+                                                         infiniopSiluDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t y,
+                                                         infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopSilu(infiniopSiluDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *y,
+                                         const void *x,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc);
+
+#endif
diff --git a/scripts/python_test.py b/scripts/python_test.py
index eb2d4319e..827a660b9 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -12,18 +12,27 @@ def run_tests(args):
     failed = []
     for test in [
-        "add.py",
-        "attention.py",
-        "causal_softmax.py",
-        "clip.py",
-        "gemm.py",
-        "mul.py",
-        "random_sample.py",
-        "rearrange.py",
-        "rms_norm.py",
-        "rope.py",
-        "sub.py",
-        "swiglu.py",
+        # "add.py",
+        # "attention.py",
+        # "causal_softmax.py",
"clip.py", + # "gemm.py", + # "mul.py", + # "random_sample.py", + # "rearrange.py", + # "rms_norm.py", + # "rope.py", + # "sub.py", + # "swiglu.py", + "silu.py", + "div.py", + "logical_and.py", + "logical_or.py", + "logical_equal.py", + "relu_backward.py", + "gelu.py", + "gelu_backward.py", + "crossentropyloss_backward.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..11fdbb34d 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,15 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(silu) +DECLARE_INFINIOP_TEST(div) +DECLARE_INFINIOP_TEST(logical_and) +DECLARE_INFINIOP_TEST(logical_or) +DECLARE_INFINIOP_TEST(logical_equal) +DECLARE_INFINIOP_TEST(gelu) +DECLARE_INFINIOP_TEST(gelu_backward) +DECLARE_INFINIOP_TEST(relu_backward) +DECLARE_INFINIOP_TEST(crossentropyloss_backward) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +39,28 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(silu) \ + REGISTER_INFINIOP_TEST(div) \ + REGISTER_INFINIOP_TEST(logical_and) \ + REGISTER_INFINIOP_TEST(logical_or) \ + REGISTER_INFINIOP_TEST(logical_equal) \ + REGISTER_INFINIOP_TEST(gelu) \ + REGISTER_INFINIOP_TEST(gelu_backward) \ + REGISTER_INFINIOP_TEST(relu_backward) \ + REGISTER_INFINIOP_TEST(crossentropyloss_backward) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/crossentropyloss_backward.cpp b/src/infiniop-test/src/ops/crossentropyloss_backward.cpp new file mode 100644 index 000000000..fff99af2b --- /dev/null +++ b/src/infiniop-test/src/ops/crossentropyloss_backward.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::crossentropyloss_backward { +struct Test::Attributes { + std::shared_ptr probs; + std::shared_ptr target; + std::shared_ptr grad_logits; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("probs") == tensors.end() + || tensors.find("target") == tensors.end() + || tensors.find("grad_logits") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->probs = tensors["probs"]; + test->_attributes->target = tensors["target"]; + 
test->_attributes->grad_logits = tensors["grad_logits"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCrossEntropyLossBackWardDescriptor_t op_desc; + auto probs = _attributes->probs->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto grad_logits = _attributes->grad_logits->to(device, device_id); + CHECK_OR(infiniopCreateCrossEntropyLossBackWardDescriptor(handle, &op_desc, + grad_logits->desc(), + probs->desc(), + target->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create crossentropyloss_backward descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCrossEntropyLossBackWardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCrossEntropyLossBackWard(op_desc, workspace, workspace_size, + grad_logits->data(), + probs->data(), + target->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_logits, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCrossEntropyLossBackWard( + op_desc, workspace, workspace_size, + grad_logits->data(), + probs->data(), + target->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyCrossEntropyLossBackWardDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"probs", "target", "grad_logits", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_logits"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- probs: " << _attributes->probs->info() << std::endl; + oss << "- target: " << _attributes->target->info() << std::endl; + oss << "- grad_logits: " << _attributes->grad_logits->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::crossentropyloss_backward \ No newline at end of file diff --git a/src/infiniop-test/src/ops/div.cpp b/src/infiniop-test/src/ops/div.cpp new file mode 100644 index 000000000..f026359b3 --- /dev/null +++ b/src/infiniop-test/src/ops/div.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::div { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = 
tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopDivDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateDivDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create div descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetDivWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopDiv(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + 0, + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopDiv( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + 0, + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyDivDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::div \ No newline at end of file diff --git a/src/infiniop-test/src/ops/gelu.cpp b/src/infiniop-test/src/ops/gelu.cpp new file mode 100644 index 000000000..583576e66 --- /dev/null +++ b/src/infiniop-test/src/ops/gelu.cpp @@ -0,0 +1,102 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gelu { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGeluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, 
device_id); + CHECK_OR(infiniopCreateGeluDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create gelu descriptor.")); + + // Note: GeLU doesn't have GetWorkspaceSize function, so we set workspace_size to 0 + size_t workspace_size = 0; + void *workspace = nullptr; + + CHECK_OR(infiniopGelu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGelu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyGeluDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gelu \ No newline at end of file diff --git a/src/infiniop-test/src/ops/gelu_backward.cpp b/src/infiniop-test/src/ops/gelu_backward.cpp new file mode 100644 index 000000000..f81435d88 --- /dev/null +++ b/src/infiniop-test/src/ops/gelu_backward.cpp @@ -0,0 +1,110 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gelu_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGeluBackWardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateGeluBackWardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create gelu_backward descriptor.")); + + // Note: GeLU Backward doesn't have GetWorkspaceSize function, so we set workspace_size to 0 + size_t workspace_size = 0; + void *workspace = nullptr; + + 
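+    // grad_input = dGELU(input) * grad_output, computed elementwise; the result is validated against "ans" below.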
CHECK_OR(infiniopGeluBackWard(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGeluBackWard( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyGeluBackWardDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gelu_backward \ No newline at end of file diff --git a/src/infiniop-test/src/ops/logical_and.cpp b/src/infiniop-test/src/ops/logical_and.cpp new file mode 100644 index 000000000..2854d4c9f --- /dev/null +++ b/src/infiniop-test/src/ops/logical_and.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_and { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalAndDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalAndDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create logical_and descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalAndWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalAnd(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return 
TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allEqual(c, _attributes->ans); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalAnd( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyLogicalAndDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_and \ No newline at end of file diff --git a/src/infiniop-test/src/ops/logical_equal.cpp b/src/infiniop-test/src/ops/logical_equal.cpp new file mode 100644 index 000000000..712fd7c09 --- /dev/null +++ b/src/infiniop-test/src/ops/logical_equal.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create logical_equal descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allEqual(c, _attributes->ans); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( 
+ [=]() { + infiniopLogicalEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyLogicalEqualDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_equal \ No newline at end of file diff --git a/src/infiniop-test/src/ops/logical_or.cpp b/src/infiniop-test/src/ops/logical_or.cpp new file mode 100644 index 000000000..e3d8a8160 --- /dev/null +++ b/src/infiniop-test/src/ops/logical_or.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_or { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalOrDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalOrDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create logical_or descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalOrWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalOr(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allEqual(c, _attributes->ans); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalOr( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyLogicalOrDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + 
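+// logical_or declares no scalar attributes; the test is fully specified by tensors a, b, c, and ans.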
+std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_or \ No newline at end of file diff --git a/src/infiniop-test/src/ops/relu_backward.cpp b/src/infiniop-test/src/ops/relu_backward.cpp new file mode 100644 index 000000000..b3513ab3e --- /dev/null +++ b/src/infiniop-test/src/ops/relu_backward.cpp @@ -0,0 +1,112 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::relu_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopReluBackWardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateReluBackWardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create relu_backward descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetReluBackWardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopReluBackWard(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReluBackWard( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroyReluBackWardDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector 
Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::relu_backward \ No newline at end of file diff --git a/src/infiniop-test/src/ops/silu.cpp b/src/infiniop-test/src/ops/silu.cpp new file mode 100644 index 000000000..e641b52f3 --- /dev/null +++ b/src/infiniop-test/src/ops/silu.cpp @@ -0,0 +1,104 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::silu { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSiluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSiluDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create silu descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSiluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSilu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSilu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroySiluDescriptor(op_desc); + infinirtFree(workspace); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << 
"- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::silu \ No newline at end of file diff --git a/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.cc b/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.cc new file mode 100644 index 000000000..eaf4a1656 --- /dev/null +++ b/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.cc @@ -0,0 +1,59 @@ +#include "crossentropyloss_backward_cpu.h" +#include +#include +#include + +namespace op::CrossEntropyLossBackWard::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + auto shape = _info.getAllInputShapes(); + auto dim = _info.getNdim(); + const size_t N = std::accumulate(shape, shape + dim - 1, 1ull, std::multiplies()); + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream, N); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::CrossEntropyLossBackWard::cpu diff --git a/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.h b/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.h new file mode 100644 index 000000000..525e754c4 --- /dev/null +++ b/src/infiniop/ops/crossentropyloss_backward/cpu/crossentropyloss_backward_cpu.h @@ -0,0 +1,20 @@ +#ifndef __CrossEntropyLossBackWard_CPU_H__ +#define __CrossEntropyLossBackWard_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(CrossEntropyLossBackWard, cpu) + +namespace op::CrossEntropyLossBackWard::cpu { +typedef struct CrossEntropyLossBackWardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &probs, const T &target, const size_t N) const { + // Cross Entropy Loss Backward: grad_logits = (probs - target) / batch_size + return (probs - target) / static_cast(N); + } +} CrossEntropyLossBackWardOp; +} // namespace op::CrossEntropyLossBackWard::cpu + +#endif // __CrossEntropyLossBackWard_CPU_H__ diff --git a/src/infiniop/ops/crossentropyloss_backward/cuda/kernel.cuh b/src/infiniop/ops/crossentropyloss_backward/cuda/kernel.cuh new file mode 100644 index 000000000..94b5ccec9 --- /dev/null +++ 
+++ b/src/infiniop/ops/crossentropyloss_backward/cuda/kernel.cuh
@@ -0,0 +1,25 @@
+#ifndef __CROSSENTROPYLOSSBACKWARD_CUDA_H__
+#define __CROSSENTROPYLOSSBACKWARD_CUDA_H__
+
+#include <type_traits>
+namespace op::CrossEntropyLossBackWard::cuda {
+typedef struct CrossEntropyLossBackWardOp {
+public:
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &probs, const T &target, const float N) const {
+        // Cross Entropy Loss Backward: grad_logits = (probs - target) / batch_size
+        if constexpr (std::is_same_v<T, half>) {
+            return __hdiv(__hsub(probs, target), __float2half(N));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __hdiv(__hsub(probs, target), __float2bfloat16(N));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fdiv_rn(__fsub_rn(probs, target), N);
+        } else {
+            return (probs - target) / N;
+        }
+    }
+} CrossEntropyLossBackWardOp;
+} // namespace op::CrossEntropyLossBackWard::cuda
+
+#endif // __CrossEntropyLossBackWard_CUDA_H__
diff --git a/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.h b/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.h
new file mode 100644
index 000000000..f3274a445
--- /dev/null
+++ b/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.h
@@ -0,0 +1,8 @@
+#ifndef __CROSSENTROPYLOSSBACKWARD_METAX_API_H__
+#define __CROSSENTROPYLOSSBACKWARD_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(CrossEntropyLossBackWard, metax)
+
+#endif // __CrossEntropyLossBackWard_METAX_API_H__
diff --git a/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.maca b/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.maca
new file mode 100644
index 000000000..0c96a031b
--- /dev/null
+++ b/src/infiniop/ops/crossentropyloss_backward/metax/crossentropyloss_backward_metax.maca
@@ -0,0 +1,64 @@
+#include "crossentropyloss_backward_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::CrossEntropyLossBackWard::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create Metax elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    auto shape = _info.getAllInputShapes();
+    auto dim = _info.getNdim();
+    size_t N = std::accumulate(shape, shape + dim - 1, 1ull, std::multiplies<size_t>());
+    float fN = static_cast<float>(N);
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, half>(_info, workspace, output, inputs, stream, std::move(fN));
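+    // N is the number of logits rows (all dims except the last); it is forwarded as float fN so the kernel divides in floating point.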
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, std::move(fN));
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, float>(_info, workspace, output, inputs, stream, std::move(fN));
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::CrossEntropyLossBackWard::metax
diff --git a/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cu b/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cu
new file mode 100644
index 000000000..40cca13a7
--- /dev/null
+++ b/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cu
@@ -0,0 +1,64 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "crossentropyloss_backward_nvidia.cuh"
+
+namespace op::CrossEntropyLossBackWard::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    auto shape = _info.getAllInputShapes();
+    auto dim = _info.getNdim();
+    size_t N = std::accumulate(shape, shape + dim - 1, 1ull, std::multiplies<size_t>());
+    float fN = static_cast<float>(N);
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, half>(_info, workspace, output, inputs, stream, std::move(fN));
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, std::move(fN));
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::CrossEntropyLossBackWardOp, float>(_info, workspace, output, inputs, stream, std::move(fN));
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::CrossEntropyLossBackWard::nvidia
diff --git a/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cuh b/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cuh
new file mode 100644
index 000000000..f61edccb1
--- /dev/null
+++ b/src/infiniop/ops/crossentropyloss_backward/nvidia/crossentropyloss_backward_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __CROSSENTROPYLOSSBACKWARD_CUDA_API_H__
+#define __CROSSENTROPYLOSSBACKWARD_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(CrossEntropyLossBackWard, nvidia)
+
+#endif // __CrossEntropyLossBackWard_CUDA_API_H__
diff --git
a/src/infiniop/ops/crossentropyloss_backward/operator.cc b/src/infiniop/ops/crossentropyloss_backward/operator.cc new file mode 100644 index 000000000..3ab974c7e --- /dev/null +++ b/src/infiniop/ops/crossentropyloss_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/crossentropyloss_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/crossentropyloss_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/crossentropyloss_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/crossentropyloss_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCrossEntropyLossBackWardDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyLossBackWardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_logits_desc, + infiniopTensorDescriptor_t probs_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::CrossEntropyLossBackWard::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_logits_desc, \ + {probs_desc, \ + target_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossBackWardWorkspaceSize(infiniopCrossEntropyLossBackWardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCrossEntropyLossBackWard( + infiniopCrossEntropyLossBackWardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_logits, + const void *probs, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_logits, {probs, target}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCrossEntropyLossBackWardDescriptor(infiniopCrossEntropyLossBackWardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef 
ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..8702fb581 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,68 @@ +#include "div_cpu.h" +#include "../div_mode.h" +#include "infinicore.h" +#include + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const { + +#define CALCULATE(OP) \ + switch (_dtype) { \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate(_info, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate(_info, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (mode == MODE_DEFAULT) { + CALCULATE(DivOp) + } else if (mode == MODE_TRUNC) { + CALCULATE(DivOpTrunc) + } else if (mode == MODE_FLOOR) { + CALCULATE(DivOpFloor) + } + +#undef CALCULATE + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h new file mode 100644 index 000000000..5ee1d3d0c --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -0,0 +1,113 @@ +#ifndef __DIV_CPU_H__ +#define __DIV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +// ELEMENTWISE_DESCRIPTOR(div, cpu) + +namespace op::div::cpu { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::cpu::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + int mode, + 
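+        // mode selects MODE_DEFAULT, MODE_TRUNC, or MODE_FLOOR (see ../div_mode.h)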
void *stream) const; +}; + +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a / b; + } +} DivOp; +typedef struct DivOpTrunc { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float af = _f16_to_f32(a); + float bf = _f16_to_f32(b); + float res = std::trunc(af / bf); + + return _f32_to_f16(float(res)); + } else if constexpr (std::is_same_v) { + float af = _bf16_to_f32(a); + float bf = _bf16_to_f32(b); + float res = std::trunc(af / bf); + return _f32_to_bf16(res); + } else if constexpr (std::is_same_v) { + float res = std::trunc(a / b); + return res; + } else if constexpr (std::is_same_v) { + double res = std::trunc(a / b); + return res; + } else { + return std::trunc(a / b); + } + } +} DivOpTrunc; +typedef struct DivOpFloor { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float af = _f16_to_f32(a); + float bf = _f16_to_f32(b); + float res = std::floor(af / bf); + + return _f32_to_f16(float(res)); + } else if constexpr (std::is_same_v) { + float af = _bf16_to_f32(a); + float bf = _bf16_to_f32(b); + float res = std::floor(af / bf); + + return _f32_to_bf16(res); + } else if constexpr (std::is_same_v) { + float res = std::floor(a / b); + return res; + } else if constexpr (std::is_same_v) { + double res = std::floor(a / b); + return res; + } else { + return std::floor(a / b); + } + } +} DivOpFloor; +} // namespace op::div::cpu + +#endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh new file mode 100644 index 000000000..35d7582e4 --- /dev/null +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -0,0 +1,70 @@ +#ifndef __DIV_CUDA_H__ +#define __DIV_CUDA_H__ + +// #include "../../../devices/nvidia/nvidia_kernel_common.cuh" +namespace op::div::cuda { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __hdiv(a, b); + } else if constexpr (std::is_same_v) { + return __hdiv(a, b); + } else if constexpr (std::is_same_v) { + return fdividef(a, b); + } else if constexpr (std::is_same_v) { + return fdivide(a, b); + } else { + return a / b; + } + } +} DivOp; +typedef struct DivOpTrunc { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return htrunc(__hdiv(a, b)); + } else if constexpr (std::is_same_v) { + return htrunc(__hdiv(a, b)); + } else if constexpr (std::is_same_v) { + return truncf(fdividef(a, b)); + } else if constexpr (std::is_same_v) { + return trunc(fdivide(a, b)); + } else { + return a / b; + } + } +} DivOpTrunc; +typedef struct DivOpFloor { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + float fa = __half2float(a); + float fb = __half2float(b); + float res = floorf(fdividef(fa, fb)); + return __float2half(res); + } else if constexpr (std::is_same_v) { + float fa = __bfloat162float(a); + float fb = __bfloat162float(b); + float res = floorf(fdividef(fa, fb)); + return __float2bfloat16(res); + + } else if constexpr (std::is_same_v) { + // return floorf(fdividef(a, b)); + 
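// note: in the default (non fast-math) build, fdividef(a, b) and plain + // a / b are the same IEEE float division, so floorf sees the same quotient; + // the intrinsic form above is kept only as a reference. +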
return floorf(a / b); + } else if constexpr (std::is_same_v) { + return floor(fdivide(a, b)); + } else { + return a / b; + } + } +} DivOpFloor; +} // namespace op::div::cuda + +#endif // __DIV_CUDA_H__ diff --git a/src/infiniop/ops/div/div_mode.h b/src/infiniop/ops/div/div_mode.h new file mode 100644 index 000000000..63a579d5b --- /dev/null +++ b/src/infiniop/ops/div/div_mode.h @@ -0,0 +1,5 @@ +enum ModeType { + MODE_DEFAULT, + MODE_TRUNC, + MODE_FLOOR, +}; diff --git a/src/infiniop/ops/div/metax/div_metax.h b/src/infiniop/ops/div/metax/div_metax.h new file mode 100644 index 000000000..9ce65655c --- /dev/null +++ b/src/infiniop/ops/div/metax/div_metax.h @@ -0,0 +1,45 @@ +#ifndef __DIV_METAX_API_H__ +#define __DIV_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +namespace op::div::metax { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const; +}; +} // namespace op::div::metax +#endif // __DIV_METAX_API_H__ diff --git a/src/infiniop/ops/div/metax/div_metax.maca b/src/infiniop/ops/div/metax/div_metax.maca new file mode 100644 index 000000000..9882f35a5 --- /dev/null +++ b/src/infiniop/ops/div/metax/div_metax.maca @@ -0,0 +1,72 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "../div_mode.h" +#include "div_metax.h" + +namespace op::div::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto &c_shape = output_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create metax elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const { +#define CALCULATE(OP) \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::OP, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate<256, cuda::OP, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return 
_device_info->calculate<256, cuda::OP, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::OP, double>(_info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + if (mode == MODE_DEFAULT) { + CALCULATE(DivOp) + } else if (mode == MODE_TRUNC) { + CALCULATE(DivOpTrunc) + } else if (mode == MODE_FLOOR) { + CALCULATE(DivOpFloor) + } + return INFINI_STATUS_SUCCESS; + + +} +} // namespace op::div::metax diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu new file mode 100644 index 000000000..0dd91a966 --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu @@ -0,0 +1,70 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "../div_mode.h" +#include "div_nvidia.cuh" + +namespace op::div::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const { +#define CALCULATE(OP) \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::OP, half>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_BF16: \ + return _device_info->calculate<256, cuda::OP, cuda_bfloat16>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::OP, float>(_info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F64: \ + return _device_info->calculate<256, cuda::OP, double>(_info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + if (mode == MODE_DEFAULT) { + CALCULATE(DivOp) + } else if (mode == MODE_TRUNC) { + CALCULATE(DivOpTrunc) + } else if (mode == MODE_FLOOR) { + CALCULATE(DivOpFloor) + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..9e0f7c925 --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,45 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::div::nvidia { +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + 
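// (the dtype captured here selects the kernel instantiation in calculate()) +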
op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + int mode, + void *stream) const; +}; +} // namespace op::div::nvidia +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..5dd8ff527 --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,162 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "div_mode.h" +#include "infinicore.h" +#include "infiniop/ops/div.h" +#include + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *mode_, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, mode, stream) + + int mode_val = reinterpret_cast(mode_); + + int mode = MODE_DEFAULT; + if (mode_val == 0) { + mode = MODE_DEFAULT; + } else if (mode_val == 1) { + mode = MODE_TRUNC; + } else if (mode_val == 2) { + mode = MODE_FLOOR; + } else { + return INFINI_STATUS_BAD_PARAM; + } + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, 
cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gelu/cpu/gelu_cpu.cc b/src/infiniop/ops/gelu/cpu/gelu_cpu.cc new file mode 100644 index 000000000..716cb4e3e --- /dev/null +++ b/src/infiniop/ops/gelu/cpu/gelu_cpu.cc @@ -0,0 +1,50 @@ +#include "gelu_cpu.h" +#include "infinicore.h" + +namespace op::gelu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gelu::cpu diff --git a/src/infiniop/ops/gelu/cpu/gelu_cpu.h b/src/infiniop/ops/gelu/cpu/gelu_cpu.h new file mode 100644 index 000000000..376634fb3 --- /dev/null +++ b/src/infiniop/ops/gelu/cpu/gelu_cpu.h @@ -0,0 +1,49 @@ +#ifndef __GELU_CPU_H__ +#define __GELU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(gelu, cpu) + +namespace op::gelu::cpu { +typedef struct GeluOp { + float pi = 3.1415927f; + + float kappa = 0.044715; + float beta = sqrt(2 / pi); + +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &input) const { + // use Approximate formula Gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.0044715 * x **3))) + // \kappa = 0.044715, \beta = sqrt(2 / pi) + // inner() = \beta * (x + \kappa * x ** 3) + // SO: Gelu(x) = 0.5 * x * (1 + tanh(inner())) + // + if constexpr (std::is_same::value) { + float finput = _f16_to_f32(input); + float inner = beta * (finput + kappa * finput * finput * finput); + float res = 0.5 * finput * (1 + std::tanh(inner)); + + return 
_f32_to_f16(res); + } else if constexpr (std::is_same::value) { + float finput = _bf16_to_f32(input); + float inner = beta * (finput + kappa * finput * finput * finput); + float res = 0.5 * finput * (1 + std::tanh(inner)); + + return _f32_to_bf16(res); + } else if constexpr (std::is_same::value) { + float inner = beta * (input + kappa * input * input * input); + return 0.5 * input * (1 + std::tanh(inner)); + + } else { + float inner = beta * (input + kappa * input * input * input); + return 0.5 * input * (1 + std::tanh(inner)); + } + } +} GeluOp; +} // namespace op::gelu::cpu + +#endif diff --git a/src/infiniop/ops/gelu/cuda/kernel.cuh b/src/infiniop/ops/gelu/cuda/kernel.cuh new file mode 100644 index 000000000..f04ddfe55 --- /dev/null +++ b/src/infiniop/ops/gelu/cuda/kernel.cuh @@ -0,0 +1,67 @@ +#ifndef __GELU_CUDA_H__ +#define __GELU_CUDA_H__ + +namespace op::gelu::cuda { +typedef struct GeluOp { +private: + __device__ __forceinline__ half htanh_approx(const half x) const { + // Pade approximation: tanh(x) ≈ x * (27 + x^2) / (27 + 9*x^2) + // More accurate for small values + float fx = __half2float(x); + float tanh_val = tanhf(fx); + return __float2half(tanh_val); + } + + __device__ __forceinline__ cuda_bfloat16 htanh_approx(const cuda_bfloat16 x) const { + float fx = __bfloat162float(x); + float tanh_val = tanhf(fx); + return __float2bfloat16(tanh_val); + } + float pi = 3.1415927f; + float kappa = 0.044715f; + float beta = sqrtf(2 / pi); + + half h_kappa = __float2half(kappa); + cuda_bfloat16 b_kappa = __float2bfloat16(kappa); + + half h_beta = __float2half(beta); + cuda_bfloat16 b_beta = __float2bfloat16(beta); + + half h_point_fiv = __float2half(0.5f); + cuda_bfloat16 b_point_fiv = __float2bfloat16(0.5f); + + half h_one = __float2half(1.0f); + cuda_bfloat16 b_one = __float2bfloat16(1.0f); + +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &input) const { + // use Approximate formula Gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.0044715 * x **3))) + // \kappa = 0.044715, \beta = sqrt(2 / pi) + // inner() = \beta * (x + \kappa * x ** 3) + // Gelu(x) = 0.5 * x * (1 + tanh(inner())) + + if constexpr (std::is_same_v) { + half inner = __hmul(h_beta, __hadd(input, __hmul(h_kappa, __hmul(input, __hmul(input, input))))); + half res = __hmul(h_point_fiv, __hmul(input, __hadd(h_one, htanh_approx(inner)))); + + return res; + } else if constexpr (std::is_same_v) { + cuda_bfloat16 inner = __hmul(b_beta, __hadd(input, __hmul(b_kappa, __hmul(input, __hmul(input, input))))); + cuda_bfloat16 res = __hmul(b_point_fiv, __hmul(input, __hadd(b_one, htanh_approx(inner)))); + + return res; + } else if constexpr (std::is_same_v) { + float inner = __fmul_rn(beta, __fadd_rn(input, __fmul_rn(kappa, __fmul_rn(input, __fmul_rn(input, input))))); + float res = __fmul_rn(0.5f, __fmul_rn(input, __fadd_rn(1.0f, tanhf(inner)))); + + return res; + } else { + return 0.5 * input * (1 + beta * (input + kappa * powf(input, 3))); + } + }; +} GeluOp; +} // namespace op::gelu::cuda + +#endif // __silu_CUDA_H__ diff --git a/src/infiniop/ops/gelu/metax/gelu_metax.h b/src/infiniop/ops/gelu/metax/gelu_metax.h new file mode 100644 index 000000000..53ff79f10 --- /dev/null +++ b/src/infiniop/ops/gelu/metax/gelu_metax.h @@ -0,0 +1,8 @@ +#ifndef __GELU_METAX_API_H__ +#define __GELU_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(gelu, metax) + +#endif // __SWIGLU_METAX_API_H__ diff --git 
a/src/infiniop/ops/gelu/metax/gelu_metax.maca b/src/infiniop/ops/gelu/metax/gelu_metax.maca new file mode 100644 index 000000000..4e9b4de08 --- /dev/null +++ b/src/infiniop/ops/gelu/metax/gelu_metax.maca @@ -0,0 +1,55 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "gelu_metax.h" + +namespace op::gelu::metax { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector<infiniopTensorDescriptor_t> input_descs) { + auto handle = reinterpret_cast<device::metax::Handle *>(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape); + + // create MetaX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::metax diff --git a/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu new file mode 100644 index 000000000..dae788e45 --- /dev/null +++ b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu @@ -0,0 +1,55 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "gelu_nvidia.cuh" +#include "infinicore.h" + +namespace op::gelu::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector<infiniopTensorDescriptor_t> input_descs) { + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return 
_device_info->calculate<256, cuda::GeluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::nvidia diff --git a/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh new file mode 100644 index 000000000..40f75a967 --- /dev/null +++ b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELU_CUDA_API_H__ +#define __GELU_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelu, nvidia) + +#endif diff --git a/src/infiniop/ops/gelu/operator.cc b/src/infiniop/ops/gelu/operator.cc new file mode 100644 index 000000000..c9d64b422 --- /dev/null +++ b/src/infiniop/ops/gelu/operator.cc @@ -0,0 +1,132 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/gelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelu_cpu.h" +#endif +#ifdef ENABLE_ILUVATAR_API +#include "nvidia/gelu_nvidia.cuh" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/gelu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gelu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGeluDescriptor( + infiniopHandle_t handle, + infiniopGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { +#define CTEATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CTEATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CTEATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CTEATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CTEATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CTEATE +} + +__C infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopGelu( + infiniopGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case 
CASE: \ + delete reinterpret_cast<op::gelu::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc new file mode 100644 index 000000000..fc13bf7c6 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc @@ -0,0 +1,52 @@ +#include "gelu_backward_cpu.h" +#include "infinicore.h" + +namespace op::gelu_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<GeluBackWardOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<GeluBackWardOp, float>(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<GeluBackWardOp, bf16_t>(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::gelu_backward::cpu diff --git a/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h new file mode 100644 index 000000000..18a56cf86 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h @@ -0,0 +1,52 @@ +#ifndef __GELU_BACKWARD_CPU_H__ +#define __GELU_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include <cmath> + +ELEMENTWISE_DESCRIPTOR(gelu_backward, cpu) + +namespace op::gelu_backward::cpu { +typedef struct GeluBackWardOp { + float pi = 3.1415927f; + float kappa = 0.044715f; + float Beta = sqrt(2 / pi); + +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &input, const T &grad_output) const { + // uses the tanh approximation Gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x ** 3))) + // \kappa = 0.044715, \beta = sqrt(2 / pi) + // inner() = \beta * (x + \kappa * x ** 3) + // so Gelu(x) = 0.5 * x * (1 + tanh(inner())) + // + // d(Gelu(x)) = 0.5 * ((1 + tanh(inner())) + x * (1 - tanh(inner())^2) * dinner()) +#define CREATE_D_DEUL(x) \ + float inner = Beta * (x + kappa * x * x * x); \ + float dinner = Beta * (1 + 3 * kappa * x * x); \ + float tanh_inner = std::tanh(inner); \ + float dGelu = 0.5 * ((1 + tanh_inner) + x * (1 - tanh_inner * tanh_inner) * dinner); + + if constexpr (std::is_same<T, fp16_t>::value) { + float finput = _f16_to_f32(input); + CREATE_D_DEUL(finput) + + return _f32_to_f16(_f16_to_f32(grad_output) * dGelu); + + } else if constexpr (std::is_same<T, bf16_t>::value) { + float finput = _bf16_to_f32(input); + CREATE_D_DEUL(finput) + + return _f32_to_bf16(_bf16_to_f32(grad_output) * dGelu); + + } else if constexpr (std::is_same<T, float>::value) { + CREATE_D_DEUL(input) + + return grad_output * dGelu; + } + } +#undef CREATE_D_DEUL +} GeluBackWardOp; +} // namespace op::gelu_backward::cpu + +#endif diff --git a/src/infiniop/ops/gelu_backward/cuda/kernel.cuh b/src/infiniop/ops/gelu_backward/cuda/kernel.cuh new file mode 100644 index 000000000..7b92a2d55 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cuda/kernel.cuh @@ -0,0 +1,88 @@ +#ifndef __gelu_backward_CUDA_H__ +#define __gelu_backward_CUDA_H__ + +namespace op::gelu_backward::cuda { +typedef struct GeluBackWardOp { +private: + // device-side tanh helpers for half and bfloat16 types + __device__ __forceinline__ half htanh_approx(const half x) const { + // Pade approximation: tanh(x) ≈ x * (27 + x^2) / (27 + 9*x^2) + // More accurate for small values + half x2 = __hmul(x, x); + half numerator = __hmul(x, __hadd(__float2half(27.0f), x2)); + half denominator = __hadd(__float2half(27.0f), __hmul(__float2half(9.0f), x2)); + return __hdiv(numerator, denominator); + } + + __device__ __forceinline__ cuda_bfloat16 htanh(const cuda_bfloat16 x) const { + // For bfloat16, convert to float, compute tanh, and convert back + float xf = __bfloat162float(x); + float tanh_val = tanhf(xf); + return __float2bfloat16(tanh_val); + } + float pi = 3.1415927f; + float kappa = 0.044715f; + float beta = sqrtf(2 / pi); + + half h_kappa = __float2half(kappa); + cuda_bfloat16 b_kappa = __float2bfloat16(kappa); + + half h_beta = __float2half(beta); + cuda_bfloat16 b_beta = __float2bfloat16(beta); + + half h_point_fiv = __float2half(0.5f); + cuda_bfloat16 b_point_fiv = __float2bfloat16(0.5f); + + half h_one = __float2half(1.0f); + cuda_bfloat16 b_one = __float2bfloat16(1.0f); + + half h_three = __float2half(3.0f); + cuda_bfloat16 b_three = __float2bfloat16(3.0f); + +public: + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &input, const T &grad_output) const { + // uses the tanh approximation Gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x ** 3))) + // \kappa = 0.044715, \beta = sqrt(2 / pi) + // inner() = \beta * (x + \kappa * x ** 3) + // dinner() = \beta * (1 + 3 * \kappa * x ** 2) + // d(Gelu(x)) = 0.5 * ((1 + tanh(inner())) + x * (1 - tanh(inner())^2) * dinner()) + // + // GeluBackWard(x) = grad_output * d(Gelu(x)) + + if constexpr (std::is_same_v<T, half>) { + half inner = __hmul(h_beta, __hadd(input, __hmul(h_kappa, __hmul(input, __hmul(input, input))))); + half dinner = __hmul(h_beta, __hadd(h_one, __hmul(h_three, __hmul(h_kappa, __hmul(input, input))))); + half tanh = htanh_approx(inner); + half dGelu = __hmul(h_point_fiv, __hadd(h_one, __hadd(tanh, __hmul(input, __hmul(__hadd(h_one, __hneg(__hmul(tanh, tanh))), dinner))))); + + return __hmul(grad_output, dGelu); + + } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { + cuda_bfloat16 inner = __hmul(b_beta, __hadd(input, __hmul(b_kappa, __hmul(input, __hmul(input, input))))); + cuda_bfloat16 dinner = __hmul(b_beta, __hadd(b_one, __hmul(b_three, __hmul(b_kappa, __hmul(input, input))))); + cuda_bfloat16 tanh = 
htanh(inner); + cuda_bfloat16 dGelu = __hmul(b_point_fiv, __hadd(b_one, __hadd(tanh, __hmul(input, __hmul(__hadd(b_one, __hneg(__hmul(tanh, tanh))), dinner))))); + + return __hmul(grad_output, dGelu); + + } else if constexpr (std::is_same_v) { + + float inner = __fmul_rn(beta, __fadd_rn(input, __fmul_rn(kappa, __fmul_rn(input, __fmul_rn(input, input))))); + float dinner = __fmul_rn(beta, __fadd_rn(1.0f, __fmul_rn(3.0f, __fmul_rn(kappa, __fmul_rn(input, input))))); + float tanh = tanhf(inner); + float dGelu = __fmul_rn(0.5f, __fadd_rn(1.0f, __fadd_rn(tanh, __fmul_rn(input, __fmul_rn(__fsub_rn(1.0f, __fmul_rn(tanh, tanh)), dinner))))); + + return __fmul_rn(grad_output, dGelu); + + } else { + return T(0.0); + } + }; +} GeluBackWardOp; +} // namespace op::gelu_backward::cuda + +#endif // __silu_CUDA_H__ diff --git a/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h new file mode 100644 index 000000000..201ab3d55 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __GELU_BACKWARD_METAX_API_H__ +#define __GELU_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, metax) + +#endif // __SWIGLU_METAX_API_H__ diff --git a/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca new file mode 100644 index 000000000..34dc173e3 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca @@ -0,0 +1,55 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "gelu_backward_metax.h" + +namespace op::gelu_backward::metax{ +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create metax elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluBackWardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluBackWardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::metax diff --git a/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu new file mode 100644 index 
000000000..570dcfcd7 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu @@ -0,0 +1,55 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "gelu_backward_nvidia.cuh" +#include "infinicore.h" + +namespace op::gelu_backward::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = output_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = output_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluBackWardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluBackWardOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::nvidia diff --git a/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh new file mode 100644 index 000000000..7b6cd9363 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELU_BACKWARD_CUDA_API_H__ +#define __GELU_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, nvidia) + +#endif diff --git a/src/infiniop/ops/gelu_backward/operator.cc b/src/infiniop/ops/gelu_backward/operator.cc new file mode 100644 index 000000000..97da6465f --- /dev/null +++ b/src/infiniop/ops/gelu_backward/operator.cc @@ -0,0 +1,131 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/gelu_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelu_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gelu_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gelu_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGeluBackWardDescriptor( + infiniopHandle_t handle, + infiniopGeluBackWardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelu_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::gelu_backward::NAMESPACE::Descriptor **>(desc_ptr), \ + grad_input_desc, \ + {input_desc, grad_output_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetGeluBackWardWorkspaceSize(infiniopGeluBackWardDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::gelu_backward::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopGeluBackWard( + infiniopGeluBackWardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::gelu_backward::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyGeluBackWardDescriptor(infiniopGeluBackWardDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::gelu_backward::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE
} diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc new file mode 100644 index 000000000..70f1afe7d --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc @@ -0,0 +1,51 @@ +#include "logical_and_cpu.h" +#include "infinicore.h" + +namespace op::logical_and::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return 
INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::logical_and::cpu diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h new file mode 100644 index 000000000..52dc0477e --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_AND_CPU_H__ +#define __LOGICAL_AND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, cpu) + +namespace op::logical_and::cpu { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a && b; + } +} LogicalAndOp; +} // namespace op::logical_and::cpu + +#endif diff --git a/src/infiniop/ops/logical_and/cuda/kernel.cuh b/src/infiniop/ops/logical_and/cuda/kernel.cuh new file mode 100644 index 000000000..1e280165b --- /dev/null +++ b/src/infiniop/ops/logical_and/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __LOGICAL_AND_CUDA_H__ +#define __LOGICAL_AND_CUDA_H__ + +namespace op::logical_and::cuda { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return a && b; + } +} LogicalAndOp; +} // namespace op::logical_and::cuda + +#endif // __logical_and_CUDA_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.h b/src/infiniop/ops/logical_and/metax/logical_and_metax.h new file mode 100644 index 000000000..e94c5fd15 --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_METAX_API_H__ +#define __LOGICAL_AND_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, metax) + +#endif diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.maca b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca new file mode 100644 index 000000000..8be401a2e --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca @@ -0,0 +1,53 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_and_metax.h" + +namespace op::logical_and::metax { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t 
workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalAndOp, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, int8_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::metax diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu new file mode 100644 index 000000000..a4567a844 --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_and_nvidia.cuh" +#include + +namespace op::logical_and::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalAndOp, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, int8_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::nvidia diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh new file mode 100644 index 000000000..8884f9d0b --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_CUDA_API_H__ +#define __LOGICAL_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_and, nvidia) + +#endif diff --git a/src/infiniop/ops/logical_and/operator.cc b/src/infiniop/ops/logical_and/operator.cc new file mode 100644 index 000000000..e72798bee --- /dev/null +++ b/src/infiniop/ops/logical_and/operator.cc @@ -0,0 +1,134 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/logical_and.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_and_cpu.h" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/logical_and_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_and_metax.h" +#endif +#ifdef ENABLE_ILUVATAR_API +#include "nvidia/logical_and_nvidia.cuh" 
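+// illustrative call sequence for the C API below (handle, descriptors, buffers, +// and stream are assumed to exist already; a sketch, not part of this patch): +//   infiniopLogicalAndDescriptor_t desc; +//   infiniopCreateLogicalAndDescriptor(handle, &desc, c_desc, a_desc, b_desc); +//   size_t ws_size = 0; +//   infiniopGetLogicalAndWorkspaceSize(desc, &ws_size); +//   infiniopLogicalAnd(desc, workspace, ws_size, c, a, b, stream); +//   infiniopDestroyLogicalAndDescriptor(desc);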
+#endif + +__C infiniStatus_t infiniopCreateLogicalAndDescriptor( + infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_and::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopLogicalAnd( + infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.cc b/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.cc new file mode 100644 index 000000000..cda411728 --- /dev/null +++ b/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.cc @@ -0,0 +1,64 @@ +#include "logical_equal_cpu.h" +#include "infinicore.h" +#include +#include + +namespace op::logical_equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = 
input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { +#define CASE(CASE, TYPE) \ + case CASE: \ + return _device_info->calculate(_info, output, inputs, stream); + + switch (_dtype) { + CASE(INFINI_DTYPE_BOOL, bool) + CASE(INFINI_DTYPE_I8, int8_t) + CASE(INFINI_DTYPE_I16, int16_t) + CASE(INFINI_DTYPE_I32, int32_t) + CASE(INFINI_DTYPE_I64, int64_t) + CASE(INFINI_DTYPE_F16, fp16_t) + CASE(INFINI_DTYPE_BF16, bf16_t) + CASE(INFINI_DTYPE_F32, float) + CASE(INFINI_DTYPE_F64, double_t) + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::logical_equal::cpu diff --git a/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.h b/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.h new file mode 100644 index 000000000..a8c5088a0 --- /dev/null +++ b/src/infiniop/ops/logical_equal/cpu/logical_equal_cpu.h @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_EQUAL_CPU_H__ +#define __LOGICAL_EQUAL_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_equal, cpu) + +namespace op::logical_equal::cpu { +typedef struct LogicalEqualOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a == b; + } +} LogicalEqualOp; +} // namespace op::logical_equal::cpu + +#endif diff --git a/src/infiniop/ops/logical_equal/cuda/kernel.cuh b/src/infiniop/ops/logical_equal/cuda/kernel.cuh new file mode 100644 index 000000000..ee92a46bd --- /dev/null +++ b/src/infiniop/ops/logical_equal/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __LOGICAL_EQUAL_CUDA_H__ +#define __LOGICAL_EQUAL_CUDA_H__ + +namespace op::logical_equal::cuda { +typedef struct LogicalEqualOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return a == b; + } +} LogicalEqualOp; +} // namespace op::logical_equal::cuda + +#endif // __logical_equal_CUDA_H__ diff --git a/src/infiniop/ops/logical_equal/metax/logical_equal_metax.h b/src/infiniop/ops/logical_equal/metax/logical_equal_metax.h new file mode 100644 index 000000000..8db9a7083 --- /dev/null +++ b/src/infiniop/ops/logical_equal/metax/logical_equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_EQUAL_METAX_API_H__ +#define __LOGICAL_EQUAL_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_equal, metax) + +#endif diff --git a/src/infiniop/ops/logical_equal/metax/logical_equal_metax.maca b/src/infiniop/ops/logical_equal/metax/logical_equal_metax.maca new file mode 100644 index 000000000..b5763b004 --- /dev/null +++ b/src/infiniop/ops/logical_equal/metax/logical_equal_metax.maca @@ -0,0 +1,65 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_equal_metax.h" 
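+// note: this .maca unit compiles the shared cuda::LogicalEqualOp functor from +// ../cuda/kernel.cuh, as the other MetaX translation units in this patch do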
+#include + +namespace op::logical_equal::metax { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } +#define CASE(CASE, TYPE) \ + case CASE: \ + return _device_info->calculate<256, cuda::LogicalEqualOp, TYPE>(_info, workspace, output, inputs, stream); + + switch (_dtype) { + CASE(INFINI_DTYPE_BOOL, bool) + CASE(INFINI_DTYPE_I8, int8_t) + CASE(INFINI_DTYPE_I16, int16_t) + CASE(INFINI_DTYPE_I32, int32_t) + CASE(INFINI_DTYPE_I64, int64_t) + CASE(INFINI_DTYPE_F16, half) + CASE(INFINI_DTYPE_BF16, cuda_bfloat16) + CASE(INFINI_DTYPE_F32, float) + CASE(INFINI_DTYPE_F64, double_t) + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_equal::metax diff --git a/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cu b/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cu new file mode 100644 index 000000000..27076902e --- /dev/null +++ b/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cu @@ -0,0 +1,65 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_equal_nvidia.cuh" +#include + +namespace op::logical_equal::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } +#define CASE(CASE, TYPE) \ + case CASE: \ + return _device_info->calculate<256, cuda::LogicalEqualOp, TYPE>(_info, workspace, output, inputs, 
stream); + + switch (_dtype) { + CASE(INFINI_DTYPE_BOOL, bool) + CASE(INFINI_DTYPE_I8, int8_t) + CASE(INFINI_DTYPE_I16, int16_t) + CASE(INFINI_DTYPE_I32, int32_t) + CASE(INFINI_DTYPE_I64, int64_t) + CASE(INFINI_DTYPE_F16, half) + CASE(INFINI_DTYPE_BF16, cuda_bfloat16) + CASE(INFINI_DTYPE_F32, float) + CASE(INFINI_DTYPE_F64, double_t) + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_equal::nvidia diff --git a/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cuh b/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cuh new file mode 100644 index 000000000..350c37445 --- /dev/null +++ b/src/infiniop/ops/logical_equal/nvidia/logical_equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_EQUAL_CUDA_API_H__ +#define __LOGICAL_EQUAL_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_equal, nvidia) + +#endif diff --git a/src/infiniop/ops/logical_equal/operator.cc b/src/infiniop/ops/logical_equal/operator.cc new file mode 100644 index 000000000..6f39a9829 --- /dev/null +++ b/src/infiniop/ops/logical_equal/operator.cc @@ -0,0 +1,134 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/logical_equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_equal_cpu.h" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/logical_equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_equal_metax.h" +#endif +#ifdef ENABLE_ILUVATAR_API +#include "nvidia/logical_equal_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLogicalEqualDescriptor( + infiniopHandle_t handle, + infiniopLogicalEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalEqualWorkspaceSize(infiniopLogicalEqualDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopLogicalEqual( + infiniopLogicalEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + 
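+        // Each CALCULATE expansion below casts the opaque descriptor back to
+        // the backend-specific Descriptor and forwards to its calculate();
+        // the Iluvatar build reuses the nvidia implementation.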
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLogicalEqualDescriptor(infiniopLogicalEqualDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..4c92f5efe --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,51 @@ +#include "logical_or_cpu.h" +#include "infinicore.h" + +namespace op::logical_or::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..0329a0190 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, cpu) + +namespace op::logical_or::cpu { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a || b; + } +} LogicalOrOp; +} // namespace op::logical_or::cpu + +#endif diff --git a/src/infiniop/ops/logical_or/cuda/kernel.cuh b/src/infiniop/ops/logical_or/cuda/kernel.cuh new file mode 100644 index 000000000..8b2deeb1c --- /dev/null +++ b/src/infiniop/ops/logical_or/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __LOGICAL_OR_CUDA_H__ +#define __LOGICAL_OR_CUDA_H__ + +namespace op::logical_or::cuda { +typedef struct LogicalOrOp { +public: + static constexpr size_t 
num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return a || b; + } +} LogicalOrOp; +} // namespace op::logical_or::cuda + +#endif // __LOGICAL_OR_CUDA_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.h b/src/infiniop/ops/logical_or/metax/logical_or_metax.h new file mode 100644 index 000000000..ec7d9c73b --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_METAX_API_H__ +#define __LOGICAL_OR_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, metax) + +#endif diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.maca b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca new file mode 100644 index 000000000..eda4b077b --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca @@ -0,0 +1,53 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_or_metax.h" + +namespace op::logical_or::metax { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, int8_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::metax diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu new file mode 100644 index 000000000..1725fbca0 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_or_nvidia.cuh" +#include + +namespace op::logical_or::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto &b_desc = input_descs.at(1); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + const auto b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8); + + 
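+    // Unlike logical_equal, which accepts every numeric dtype, logical_or is
+    // restricted to BOOL and I8 carriers; anything else fails at create time.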
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, int8_t>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::nvidia diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh new file mode 100644 index 000000000..07731049c --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_CUDA_API_H__ +#define __LOGICAL_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_or, nvidia) + +#endif diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..5a6ecd35d --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,134 @@ +#include "../../handle.h" +#include "infinicore.h" +#include "infiniop/ops/logical_or.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/logical_or_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_or_metax.h" +#endif +#ifdef ENABLE_ILUVATAR_API +#include "nvidia/logical_or_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLogicalOrDescriptor( + infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_or::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc}) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopLogicalOr( + infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void 
*stream) {
+#define CALCULATE(CASE, NAMESPACE)                                            \
+    case CASE:                                                                \
+        return reinterpret_cast<const op::logical_or::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, c, {a, b}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CALCULATE
+}
+
+__C infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc) {
+#define DELETE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                 \
+        delete reinterpret_cast<op::logical_or::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DELETE
+}
diff --git a/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc
new file mode 100644
index 000000000..88d998f34
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc
@@ -0,0 +1,54 @@
+#include "relu_backward_cpu.h"
+
+namespace op::relu_backward::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &grad_output_desc = input_desc_vec.at(1);
+    const auto &grad_input_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+    const auto &grad_output_shape = grad_output_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ReluOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ReluOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ReluOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ReluOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::relu_backward::cpu
diff --git a/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h
new file mode 100644
index 000000000..ab70b3eed
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __RELU_BACKWARD_CPU_H__
+#define __RELU_BACKWARD_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(relu_backward, cpu)
+
+namespace op::relu_backward::cpu {
+typedef struct ReluOp {
+public:
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    T operator()(const T &input, const T &grad_output) const {
+        auto dy = input > T(0) ? T(1) : T(0);
+        return grad_output * dy;
+    }
+} ReluOp;
+} // namespace op::relu_backward::cpu
+
+#endif // __RELU_BACKWARD_CPU_H__
diff --git a/src/infiniop/ops/relu_backward/cuda/kernel.cuh b/src/infiniop/ops/relu_backward/cuda/kernel.cuh
new file mode 100644
index 000000000..30aae9ef8
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/cuda/kernel.cuh
@@ -0,0 +1,24 @@
+#ifndef __RELU_BACKWARD_CUDA_H__
+#define __RELU_BACKWARD_CUDA_H__
+
+namespace op::relu_backward::cuda {
+typedef struct ReluBackWardOp {
+    static constexpr size_t num_inputs = 2;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input, const T &grad_output) const {
+        auto dy = input > T(0) ? T(1) : T(0);
+        if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, cuda_bfloat162>) {
+            return __hmul2(grad_output, dy);
+        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+            return __hmul(grad_output, dy);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fmul_rn(grad_output, dy);
+        } else {
+            return grad_output * dy;
+        }
+    }
+} ReluBackWardOp;
+
+} // namespace op::relu_backward::cuda
+
+#endif // __RELU_BACKWARD_CUDA_H__
diff --git a/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h
new file mode 100644
index 000000000..13c41b83a
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h
@@ -0,0 +1,8 @@
+#ifndef __RELU_BACKWARD_METAX_API_H__
+#define __RELU_BACKWARD_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(relu_backward, metax)
+
+#endif // __RELU_BACKWARD_METAX_API_H__
diff --git a/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca
new file mode 100644
index 000000000..79e38743e
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca
@@ -0,0 +1,60 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+#include "../cuda/kernel.cuh"
+#include "relu_backward_metax.h"
+
+namespace op::relu_backward::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &grad_output_desc = input_desc_vec.at(1);
+    const auto &grad_input_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+    const auto &grad_output_shape = grad_output_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape);
+
+    // create METAX elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::relu_backward::metax
diff --git a/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu
new file mode 100644
index 000000000..9d67cb1e5
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu
@@ -0,0 +1,56 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "relu_backward_nvidia.cuh"
+
+namespace op::relu_backward::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &grad_output_desc = input_desc_vec.at(1);
+    const auto &grad_input_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+    const auto &grad_output_shape = grad_output_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, double>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ReluBackWardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::relu_backward::nvidia
diff --git a/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh
new file mode 100644
index 000000000..b29db1540
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __RELU_BACKWARD_NVIDIA_API_H__
+#define __RELU_BACKWARD_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(relu_backward, nvidia)
+
+#endif // __RELU_BACKWARD_NVIDIA_API_H__
diff --git a/src/infiniop/ops/relu_backward/operator.cc b/src/infiniop/ops/relu_backward/operator.cc
new file mode 100644
index 000000000..3b1c92730
--- /dev/null
+++ b/src/infiniop/ops/relu_backward/operator.cc
@@ -0,0 +1,144 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/relu_backward.h"
+
+#ifdef ENABLE_CPU_API
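All four backend listings above implement the same rule: ReLU backward passes the upstream gradient through wherever the forward input was positive and zeroes it elsewhere, i.e. `grad_input = input > 0 ? grad_output : 0`. A contiguous-float reference, useful for sanity-checking the kernels (the strided and broadcast handling lives in the elementwise framework, so this sketch deliberately ignores it):

```cpp
#include <cstddef>

// Reference ReLU backward over contiguous buffers:
// grad_input[i] = grad_output[i] if input[i] > 0, else 0.
void relu_backward_ref(const float *input, const float *grad_output,
                       float *grad_input, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        grad_input[i] = input[i] > 0.0f ? grad_output[i] : 0.0f;
    }
}
```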
+#include "cpu/relu_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/relu_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/relu_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateReluBackWardDescriptor( + infiniopHandle_t handle, + infiniopReluBackWardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::relu_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetReluBackWardWorkspaceSize(infiniopReluBackWardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReluBackWard( + infiniopReluBackWardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyReluBackWardDescriptor(infiniopReluBackWardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/silu/cpu/silu_cpu.cc b/src/infiniop/ops/silu/cpu/silu_cpu.cc new file mode 100644 index 000000000..c005d7a80 --- /dev/null +++ b/src/infiniop/ops/silu/cpu/silu_cpu.cc @@ -0,0 +1,50 @@ +#include "silu_cpu.h" +#include "infinicore.h" + +namespace op::silu::cpu { + 
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SiluOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SiluOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SiluOp, bf16_t>(_info, output, inputs, stream);
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::silu::cpu
diff --git a/src/infiniop/ops/silu/cpu/silu_cpu.h b/src/infiniop/ops/silu/cpu/silu_cpu.h
new file mode 100644
index 000000000..8ca15afee
--- /dev/null
+++ b/src/infiniop/ops/silu/cpu/silu_cpu.h
@@ -0,0 +1,25 @@
+#ifndef __SILU_CPU_H__
+#define __SILU_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(silu, cpu)
+
+namespace op::silu::cpu {
+typedef struct SiluOp {
+private:
+    template <typename T>
+    T sigmoid(const T &x) const {
+        return T(1) / (T(1) + std::exp(-x));
+    }
+
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    T operator()(const T &x) const {
+        return x * sigmoid(x);
+    }
+} SiluOp;
+} // namespace op::silu::cpu
+
+#endif
diff --git a/src/infiniop/ops/silu/cuda/kernel.cuh b/src/infiniop/ops/silu/cuda/kernel.cuh
new file mode 100644
index 000000000..a239619b8
--- /dev/null
+++ b/src/infiniop/ops/silu/cuda/kernel.cuh
@@ -0,0 +1,65 @@
+#ifndef __SILU_CUDA_H__
+#define __SILU_CUDA_H__
+
+namespace op::silu::cuda {
+typedef struct SiluOp {
+private:
+    template <typename T>
+    __device__ __forceinline__ T sigmoid(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
+            float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
+            return __floats2bfloat162_rn(sig0, sig1);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(x);
+            return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __frcp_rn(__fadd_rn(1, __expf(-x)));
+        } else {
+            return 1 / (1 + std::exp(-x));
+        }
+    }
+
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __hmul2(x, sigmoid(x));
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __hmul(x, sigmoid(x));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            cuda_bfloat162 sig = sigmoid(x);
+
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 =
__bfloat162float(__high2bfloat16(x)); + float sig0 = __bfloat162float(__low2bfloat16(sig)); + float sig1 = __bfloat162float(__high2bfloat16(sig)); + + float res0 = __fmul_rn(x0, sig0); + float res1 = __fmul_rn(x1, sig1); + return __floats2bfloat162_rn(res0, res1); + } else if constexpr (std::is_same_v) { + cuda_bfloat16 sig = sigmoid(x); + + float xf = __bfloat162float(x); + float sigf = __bfloat162float(sig); + + float res = __fmul_rn(xf, sigf); + return __float2bfloat16_rn(res); + } else if constexpr (std::is_same_v) { + return __fmul_rn(x, sigmoid(x)); + } else { + return x * sigmoid(x); + } + }; +} SiluOp; +} // namespace op::silu::cuda + +#endif // __silu_CUDA_H__ diff --git a/src/infiniop/ops/silu/metax/silu_metax.h b/src/infiniop/ops/silu/metax/silu_metax.h new file mode 100644 index 000000000..4d66b2ead --- /dev/null +++ b/src/infiniop/ops/silu/metax/silu_metax.h @@ -0,0 +1,8 @@ +#ifndef __SILU_METAX_API_H__ +#define __SILU_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(silu, metax) + +#endif // __SWIGLU_METAX_API_H__ diff --git a/src/infiniop/ops/silu/metax/silu_metax.maca b/src/infiniop/ops/silu/metax/silu_metax.maca new file mode 100644 index 000000000..6f93d67c7 --- /dev/null +++ b/src/infiniop/ops/silu/metax/silu_metax.maca @@ -0,0 +1,53 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "silu_metax.h" + +namespace op::silu::metax{ +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + const auto &a_desc = input_descs.at(0); + const auto c_shape = output_desc->shape(); + const auto a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape); + + // create metax elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, output_desc, input_descs) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::silu::metax diff --git a/src/infiniop/ops/silu/nvidia/silu_nvidia.cu b/src/infiniop/ops/silu/nvidia/silu_nvidia.cu new file mode 100644 index 000000000..a3751a58c --- /dev/null +++ b/src/infiniop/ops/silu/nvidia/silu_nvidia.cu @@ -0,0 +1,53 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "silu_nvidia.cuh" + +namespace op::silu::nvidia { +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + 
std::vector<infiniopTensorDescriptor_t> input_descs) {
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = output_desc->dtype();
+
+    const auto &a_desc = input_descs.at(0);
+    const auto c_shape = output_desc->shape();
+    const auto a_shape = a_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, output_desc, input_descs)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::silu::nvidia
diff --git a/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh b/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh
new file mode 100644
index 000000000..ba01648ae
--- /dev/null
+++ b/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __SILU_CUDA_API_H__
+#define __SILU_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(silu, nvidia)
+
+#endif
diff --git a/src/infiniop/ops/silu/operator.cc b/src/infiniop/ops/silu/operator.cc
new file mode 100644
index 000000000..2ce72c8ab
--- /dev/null
+++ b/src/infiniop/ops/silu/operator.cc
@@ -0,0 +1,132 @@
+#include "../../handle.h"
+#include "infinicore.h"
+#include "infiniop/ops/silu.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/silu_cpu.h"
+#endif
+#ifdef ENABLE_NVIDIA_API
+#include "nvidia/silu_nvidia.cuh"
+#endif
+#ifdef ENABLE_ILUVATAR_API
+#include "nvidia/silu_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/silu_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateSiluDescriptor(
+    infiniopHandle_t handle,
+    infiniopSiluDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+#define CREATE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                 \
+        return op::silu::NAMESPACE::Descriptor::create(                        \
+            handle,                                                            \
+            reinterpret_cast<op::silu::NAMESPACE::Descriptor **>(desc_ptr),    \
+            y_desc,                                                            \
+            {x_desc})
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size) {
+#define GET(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                 \
+        *size = reinterpret_cast<op::silu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+
+
default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopSilu( + infiniopSiluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/test/infiniop/crossentropyloss_backward.py b/test/infiniop/crossentropyloss_backward.py new file mode 100644 index 000000000..755d19a9c --- /dev/null +++ b/test/infiniop/crossentropyloss_backward.py @@ -0,0 +1,191 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto +import math + +import torch +from torch.testing import assert_close +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. 
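+    # The reference gradient checked by this file follows from softmax plus
+    # cross-entropy with mean reduction: for L = -(1/N) * sum_i log p_i[t_i],
+    # the gradient w.r.t. the logits is dL/dz = (p - y) / N, where p are the
+    # softmax probabilities, y the one-hot targets, and N the number of rows
+    # (numel / last dim). See crossentropyloss_backward() below.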
+ # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + INPLACE_X2 = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, + Inplace.INPLACE_X2, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +# _TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16] +# _TENSOR_DTYPES = [InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def crossentropyloss_backward(probs, target): + shape = probs.shape + N = shape.numel() / shape[-1] + return (probs - target) / N + + +def test( + handle, + device, + shape, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor( + shape, + None, + dtype, + device, + ) + grad_output = TestTensor( + shape, + None, + dtype, + device, + ) + + if inplace == Inplace.INPLACE_X: + y = input + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing crossentropyloss_backward on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = crossentropyloss_backward(input.torch_tensor(), grad_output.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossBackWardDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + input.descriptor, + grad_output.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [y, input, grad_output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossBackWardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_crossentropyloss_backward(): + LIBINFINIOP.infiniopCrossEntropyLossBackWard( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + input.data(), + grad_output.data(), + None, + ) + + lib_crossentropyloss_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + # print(ans) + # print(y.actual_tensor()) + assert_close(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: crossentropyloss_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_crossentropyloss_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error( + LIBINFINIOP.infiniopDestroyCrossEntropyLossBackWardDescriptor(descriptor) + ) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = 
args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..2c3e17c30 --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,394 @@ +import torch +from torch.testing import assert_close +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + InfiniDivMode, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + (mode,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE + # for mode in InfiniDivMode + for mode in InfiniDivMode +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b, mode): + torch_mode = [None, "trunc", "floor"] + + torch.div(a, b, rounding_mode=torch_mode[mode.value], out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + mode=None, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}, Mode: {mode}" + ) + + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor(), mode) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + 
ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetDivWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_div():
+        check_error(
+            LIBINFINIOP.infiniopDiv(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                mode,
+                None,
+            )
+        )
+
+    lib_div()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if (dtype == InfiniDtype.F32) and (mode == InfiniDivMode.FLOOR):
+        atol = 1
+
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True)
+
+    assert_close(
+        c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True
+    )
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor(), mode), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/gelu.py b/test/infiniop/gelu.py
new file mode 100644
index 000000000..9c6b36fee
--- /dev/null
+++ b/test/infiniop/gelu.py
@@ -0,0 +1,161 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
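gelu.py validates against PyTorch's tanh approximation (`approximate="tanh"`), which replaces the exact Gaussian CDF with

    gelu(x) ~ 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))

The gelu_backward test further down differentiates exactly this expression, which is why its reference uses the same sqrt(2/pi) and 0.044715 constants.

+# 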
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    # TODO: Uncomment the following line.
+    # ((),),
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def gelu(x):
+    return torch.nn.functional.gelu(x, approximate="tanh").to(x.dtype)
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
+):
+    x_torch_tensor = torch.rand(shape) * 2 - 1
+
+    x = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing gelu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = gelu(x.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateGeluDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetGeluWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_gelu():
+        check_error(
+            LIBINFINIOP.infiniopGelu(
+                descriptor, workspace.data(), workspace.size(), y.data(), x.data(), None
+            )
+        )
+
+    lib_gelu()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: gelu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_gelu(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyGeluDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
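Reviewer note: the approximate="tanh" path exercised by this test corresponds to the closed form 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). A minimal self-check of that identity against PyTorch, independent of InfiniOp and not part of the diff:

import math
import torch

def gelu_tanh_reference(x):
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    c = math.sqrt(2.0 / math.pi)
    return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * x.pow(3))))

x = torch.randn(1024)
torch.testing.assert_close(
    gelu_tanh_reference(x), torch.nn.functional.gelu(x, approximate="tanh")
)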
diff --git a/test/infiniop/gelu_backward.py b/test/infiniop/gelu_backward.py
new file mode 100644
index 000000000..03b032541
--- /dev/null
+++ b/test/infiniop/gelu_backward.py
@@ -0,0 +1,200 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+import math
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    # TODO: Uncomment the following line.
+    # ((),),
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def gelu_backward(input, grad_output):
+    sqrt_2_over_pi = math.sqrt(2.0 / math.pi)
+    alpha = 0.044715
+
+    x = input
+    x_cubed = x * x * x
+    inner = sqrt_2_over_pi * (x + alpha * x_cubed)
+    tanh_inner = torch.tanh(inner)
+
+    # derivative of the tanh part
+    sech_squared = 1 - tanh_inner * tanh_inner
+    left = 0.5 * (1 + tanh_inner)
+    right = 0.5 * x * sech_squared * sqrt_2_over_pi * (1 + 3 * alpha * x * x)
+
+    grad_input = grad_output * (left + right)
+    return grad_input
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
+):
+    x_torch_tensor = torch.rand(shape) * 2 - 1
+
+    input = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+    grad_output_torch_tensor = torch.rand(shape) * 4 - 2
+    grad_output = TestTensor(
+        shape,
+        grad_output_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=grad_output_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_X:
+        y = input
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing gelu_backward on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = gelu_backward(input.torch_tensor(), grad_output.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateGeluBackWardDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            input.descriptor,
+            grad_output.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [y, input, grad_output]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetGeluBackWardWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_gelu_backward():
+        check_error(
+            LIBINFINIOP.infiniopGeluBackWard(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                input.data(),
+                grad_output.data(),
+                None,
+            )
+        )
+
+    lib_gelu_backward()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: gelu_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_gelu_backward(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyGeluBackWardDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/datatypes.py b/test/infiniop/libinfiniop/datatypes.py
index 633aaafa7..232895d9d 100644
--- a/test/infiniop/libinfiniop/datatypes.py
+++ b/test/infiniop/libinfiniop/datatypes.py
@@ -1,3 +1,6 @@
+from enum import IntEnum
+
+
 class InfiniDtype:
     INVALID = 0
     BYTE = 1
@@ -43,3 +46,9 @@ class InfiniDtype:
     InfiniDtype.C64: "C64",
     InfiniDtype.BF16: "BF16",
 }
+
+
+class InfiniDivMode(IntEnum):
+    DEFAULT = 0
+    TRUNC = 1
+    FLOOR = 2
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index e92e77105..79df0e335 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -316,6 +316,105 @@ def relu_(lib):
     lib.infiniopDestroyReluDescriptor.argtypes = [infiniopOperatorDescriptor_t]
 
 
+@OpRegister.operator
+def relu_backward_(lib):
+    lib.infiniopCreateReluBackWardDescriptor.restype = c_int32
+    lib.infiniopCreateReluBackWardDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetReluBackWardWorkspaceSize.restype = c_int32
+    lib.infiniopGetReluBackWardWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopReluBackWard.restype = c_int32
+    lib.infiniopReluBackWard.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyReluBackWardDescriptor.restype = c_int32
+    lib.infiniopDestroyReluBackWardDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+
+@OpRegister.operator
+def gelu_(lib):
+    lib.infiniopCreateGeluDescriptor.restype = c_int32
+    lib.infiniopCreateGeluDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetGeluWorkspaceSize.restype = c_int32
+    lib.infiniopGetGeluWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopGelu.restype = c_int32
+    lib.infiniopGelu.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyGeluDescriptor.restype = c_int32
+    lib.infiniopDestroyGeluDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+
+@OpRegister.operator
+def gelu_backward_(lib):
+    lib.infiniopCreateGeluBackWardDescriptor.restype = c_int32
+    lib.infiniopCreateGeluBackWardDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetGeluBackWardWorkspaceSize.restype = c_int32
+    lib.infiniopGetGeluBackWardWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopGeluBackWard.restype = c_int32
+    lib.infiniopGeluBackWard.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyGeluBackWardDescriptor.restype = c_int32
+    lib.infiniopDestroyGeluBackWardDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+
+@OpRegister.operator
+def crossentropyloss_backward_(lib):
+    lib.infiniopCreateCrossEntropyLossBackWardDescriptor.restype = c_int32
+    lib.infiniopCreateCrossEntropyLossBackWardDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetCrossEntropyLossBackWardWorkspaceSize.restype = c_int32
+    lib.infiniopGetCrossEntropyLossBackWardWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopCrossEntropyLossBackWard.restype = c_int32
+    lib.infiniopCrossEntropyLossBackWard.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyCrossEntropyLossBackWardDescriptor.restype = c_int32
+    lib.infiniopDestroyCrossEntropyLossBackWardDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t
+    ]
+
+
 @OpRegister.operator
 def rms_norm_(lib):
     lib.infiniopCreateRMSNormDescriptor.restype = c_int32
@@ -454,6 +553,7 @@ def swiglu_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+
 @OpRegister.operator
 def conv_(lib):
     lib.infiniopCreateConvDescriptor.restype = c_int32
@@ -489,3 +589,154 @@ def conv_(lib):
     lib.infiniopDestroyConvDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
     ]
+
+
+@OpRegister.operator
+def silu_(lib):
+    lib.infiniopCreateSiluDescriptor.restype = c_int32
+    lib.infiniopCreateSiluDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetSiluWorkspaceSize.restype = c_int32
+    lib.infiniopGetSiluWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopSilu.restype = c_int32
+    lib.infiniopSilu.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroySiluDescriptor.restype = c_int32
+    lib.infiniopDestroySiluDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def logical_equal_(lib):
+    lib.infiniopCreateLogicalEqualDescriptor.restype = c_int32
+    lib.infiniopCreateLogicalEqualDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetLogicalEqualWorkspaceSize.restype = c_int32
+    lib.infiniopGetLogicalEqualWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopLogicalEqual.restype = c_int32
+    lib.infiniopLogicalEqual.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyLogicalEqualDescriptor.restype = c_int32
+    lib.infiniopDestroyLogicalEqualDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def logical_or_(lib):
+    lib.infiniopCreateLogicalOrDescriptor.restype = c_int32
+    lib.infiniopCreateLogicalOrDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetLogicalOrWorkspaceSize.restype = c_int32
+    lib.infiniopGetLogicalOrWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopLogicalOr.restype = c_int32
+    lib.infiniopLogicalOr.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyLogicalOrDescriptor.restype = c_int32
+    lib.infiniopDestroyLogicalOrDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def logical_and_(lib):
+    lib.infiniopCreateLogicalAndDescriptor.restype = c_int32
+    lib.infiniopCreateLogicalAndDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetLogicalAndWorkspaceSize.restype = c_int32
+    lib.infiniopGetLogicalAndWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopLogicalAnd.restype = c_int32
+    lib.infiniopLogicalAnd.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyLogicalAndDescriptor.restype = c_int32
+    lib.infiniopDestroyLogicalAndDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
+@OpRegister.operator
+def div_(lib):
+    lib.infiniopCreateDivDescriptor.restype = c_int32
+    lib.infiniopCreateDivDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetDivWorkspaceSize.restype = c_int32
+    lib.infiniopGetDivWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopDiv.restype = c_int32
+    lib.infiniopDiv.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_int32,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyDivDescriptor.restype = c_int32
+    lib.infiniopDestroyDivDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
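Reviewer note on the div_ registration above: infiniopDiv declares its mode parameter as c_int32, and the div test passes an InfiniDivMode member directly. This works because IntEnum members are real ints, so ctypes converts them wherever c_int32 appears in argtypes. A standalone sketch of the mechanism (not part of the diff; the enum mirrors the one added to datatypes.py):

import ctypes
from enum import IntEnum

class InfiniDivMode(IntEnum):  # mirrors the definition added to datatypes.py
    DEFAULT = 0
    TRUNC = 1
    FLOOR = 2

# IntEnum members pass ctypes' int conversion unchanged.
assert ctypes.c_int32(InfiniDivMode.FLOOR).value == 2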
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py
index 5c8e7f80a..83d0eefaf 100644
--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -70,6 +70,22 @@ def __init__(
             self._torch_tensor = torch.rand(
                 torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
             )
+        elif mode == "bool":
+            self._torch_tensor = torch.randint(
+                0,
+                2,
+                torch_shape,
+                dtype=to_torch_dtype(dt),
+                device=torch_device_map[device],
+            )
+        elif mode == "int":
+            self._torch_tensor = torch.randint(
+                -50,
+                50,
+                torch_shape,
+                dtype=to_torch_dtype(dt),
+                device=torch_device_map[device],
+            )
         elif mode == "zeros":
             self._torch_tensor = torch.zeros(
                 torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
@@ -122,7 +138,9 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum):
 
 
 def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
-    if dt == InfiniDtype.I8:
+    if dt == InfiniDtype.BOOL:
+        return torch.bool
+    elif dt == InfiniDtype.I8:
         return torch.int8
     elif dt == InfiniDtype.I16:
         return torch.int16
@@ -330,6 +348,11 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
         actual = actual.to(torch.float32)
         desired = desired.to(torch.float32)
 
+    # If either tensor is bool, compare as 0/1 integers
+    if actual.dtype == torch.bool or desired.dtype == torch.bool:
+        actual = actual.to(torch.int8)
+        desired = desired.to(torch.int8)
+
     print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose)
     np.testing.assert_allclose(
         actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True
diff --git a/test/infiniop/logical_and.py b/test/infiniop/logical_and.py
new file mode 100644
index 000000000..1b0437941
--- /dev/null
+++ b/test/infiniop/logical_and.py
@@ -0,0 +1,172 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.I8, InfiniDtype.BOOL]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def logical_and(c, a, b):
+    return torch.logical_and(a, b, out=c)
+
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    a = TestTensor(shape, a_stride, dtype, device, mode="bool")
+    b = TestTensor(shape, b_stride, dtype, device, mode="bool")
+
+    if inplace == Inplace.INPLACE_A:
+        if a_stride != c_stride:
+            return
+        c = a
+    elif inplace == Inplace.INPLACE_B:
+        if c_stride != b_stride:
+            return
+        c = b
+    else:
+        c = TestTensor(shape, c_stride, dtype, device, mode="bool")
+    if c.is_broadcast():
+        return
+
+    print(
+        f"Testing LogicalAnd on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateLogicalAndDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetLogicalAndWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_logical_and():
+        check_error(
+            LIBINFINIOP.infiniopLogicalAnd(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_logical_and()
+
+    atol, rtol = 0, 0
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_logical_and(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyLogicalAndDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/logical_equal.py b/test/infiniop/logical_equal.py
new file mode 100644
index 000000000..a185ea7bc
--- /dev/null
+++ b/test/infiniop/logical_equal.py
@@ -0,0 +1,183 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+# Three dtype categories: bool, float, and int
+bool_types = [InfiniDtype.BOOL]
+float_types = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]
+int_types = [InfiniDtype.I8, InfiniDtype.I16, InfiniDtype.I32, InfiniDtype.I64]
+
+_TENSOR_DTYPES = bool_types + float_types + int_types
+mode_map = {
+    **{t: "bool" for t in bool_types},
+    **{t: "random" for t in float_types},
+    **{t: "int" for t in int_types},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def logical_equal(c, a, b):
+    return torch.eq(a, b, out=c)
+
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    mode = mode_map.get(dtype)
+    a = TestTensor(shape, a_stride, dtype, device, mode=mode)
+    b = TestTensor(shape, b_stride, dtype, device, mode=mode)
+
+    if inplace == Inplace.INPLACE_A:
+        if a_stride != c_stride:
+            return
+        c = a
+    elif inplace == Inplace.INPLACE_B:
+        if c_stride != b_stride:
+            return
+        c = b
+    else:
+        c = TestTensor(shape, c_stride, dtype, device, mode=mode)
+    if c.is_broadcast():
+        return
+
+    print(
+        f"Testing LogicalEqual on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    logical_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateLogicalEqualDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetLogicalEqualWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_logical_equal():
+        check_error(
+            LIBINFINIOP.infiniopLogicalEqual(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_logical_equal()
+
+    atol, rtol = 0, 0
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: logical_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_logical_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyLogicalEqualDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/logical_or.py b/test/infiniop/logical_or.py
new file mode 100644
index 000000000..0a177aefc
--- /dev/null
+++ b/test/infiniop/logical_or.py
@@ -0,0 +1,172 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.I8, InfiniDtype.BOOL]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def logical_or(c, a, b):
+    return torch.logical_or(a, b, out=c)
+
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    a = TestTensor(shape, a_stride, dtype, device, mode="bool")
+    b = TestTensor(shape, b_stride, dtype, device, mode="bool")
+
+    if inplace == Inplace.INPLACE_A:
+        if a_stride != c_stride:
+            return
+        c = a
+    elif inplace == Inplace.INPLACE_B:
+        if c_stride != b_stride:
+            return
+        c = b
+    else:
+        c = TestTensor(shape, c_stride, dtype, device, mode="bool")
+    if c.is_broadcast():
+        return
+
+    print(
+        f"Testing LogicalOr on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateLogicalOrDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+    for tensor in [a, b, c]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetLogicalOrWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_logical_or():
+        check_error(
+            LIBINFINIOP.infiniopLogicalOr(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_logical_or()
+
+    atol, rtol = 0, 0
+    if DEBUG:
+        debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_logical_or(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyLogicalOrDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/relu_backward.py b/test/infiniop/relu_backward.py
new file mode 100644
index 000000000..58f6b8d13
--- /dev/null
+++ b/test/infiniop/relu_backward.py
@@ -0,0 +1,177 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    # TODO: Uncomment the following line.
+    # ((),),
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_INPUT = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_INPUT,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+    InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def relu_backward(input, grad_output):
+    dy = (input > 0).float().to(input.dtype)
+    return grad_output * dy
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=torch.float16, sync=None
+):
+    input_torch_tensor = torch.rand(shape) * 2 - 1
+    input = TestTensor(
+        shape,
+        input_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=input_torch_tensor,
+    )
+    grad_output_torch_tensor = torch.rand(shape) * 4 - 2
+    grad_output = TestTensor(
+        shape,
+        grad_output_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=grad_output_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_INPUT:
+        grad_input = input
+    else:
+        grad_input = TestTensor(shape, None, dtype, device)
+    print(
+        f"Testing ReluBackWard on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = relu_backward(input.torch_tensor(), grad_output.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReluBackWardDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            grad_input.descriptor,
+            input.descriptor,
+            grad_output.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [grad_input, input, grad_output]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReluBackWardWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, input.device)
+
+    def lib_relu_backward():
+        check_error(
+            LIBINFINIOP.infiniopReluBackWard(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                grad_input.data(),
+                input.data(),
+                grad_output.data(),
+                None,
+            )
+        )
+
+    lib_relu_backward()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(grad_input.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(grad_input.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
profile_operation("PyTorch", lambda: relu_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_relu_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReluBackWardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/silu.py b/test/infiniop/silu.py new file mode 100644 index 000000000..7f971dec7 --- /dev/null +++ b/test/infiniop/silu.py @@ -0,0 +1,173 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + profile_operation, + TestWorkspace, + get_tolerance, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + # TODO: Uncomment the following line. + # ((),), + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-3}, + InfiniDtype.F32: {"atol": 2e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def silu(x): + return x * torch.sigmoid(x) + + +def test( + handle, + device, + shape, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + x_torch_tensor = torch.rand(shape) * 2 - 1 + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Silu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = silu(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSiluDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSiluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_silu(): + 
diff --git a/test/infiniop/silu.py b/test/infiniop/silu.py
new file mode 100644
index 000000000..7f971dec7
--- /dev/null
+++ b/test/infiniop/silu.py
@@ -0,0 +1,173 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    profile_operation,
+    TestWorkspace,
+    get_tolerance,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+
+# ==============================================================================
+# Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    # TODO: Uncomment the following line.
+    # ((),),
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-3},
+    InfiniDtype.F32: {"atol": 2e-7, "rtol": 1e-7},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def silu(x):
+    return x * torch.sigmoid(x)
+
+
+def test(
+    handle,
+    device,
+    shape,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    x_torch_tensor = torch.rand(shape) * 2 - 1
+    x = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing Silu on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = silu(x.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateSiluDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+        )
+    )
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetSiluWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_silu():
+        check_error(
+            LIBINFINIOP.infiniopSilu(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_silu()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: silu(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_silu(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroySiluDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
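Reviewer note: the hand-written gelu_backward reference used earlier in this diff can be cross-checked against autograd on the same tanh-approximate GELU. A minimal sketch in float64, independent of InfiniOp, with the same constants as the test:

import math
import torch

def gelu_backward_reference(x, grad_output):
    # Closed-form derivative of 0.5 * x * (1 + tanh(c * (x + a * x^3))),
    # mirroring the formula in test/infiniop/gelu_backward.py.
    c = math.sqrt(2.0 / math.pi)
    a = 0.044715
    t = torch.tanh(c * (x + a * x**3))
    sech2 = 1 - t * t
    return grad_output * (0.5 * (1 + t) + 0.5 * x * sech2 * c * (1 + 3 * a * x * x))

x = torch.randn(512, dtype=torch.float64, requires_grad=True)
g = torch.randn(512, dtype=torch.float64)
y = torch.nn.functional.gelu(x, approximate="tanh")
(autograd_grad,) = torch.autograd.grad(y, x, grad_outputs=g)
torch.testing.assert_close(gelu_backward_reference(x.detach(), g), autograd_grad)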