diff --git a/include/infiniop.h b/include/infiniop.h
index b3cf8b6ca..3710fe1e5 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -20,5 +20,6 @@
+#include "infiniop/ops/all_equal.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/tensor_descriptor.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/all_equal.h b/include/infiniop/ops/all_equal.h
new file mode 100644
index 000000000..e22873bde
--- /dev/null
+++ b/include/infiniop/ops/all_equal.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_ALL_EQUAL_API_H__
+#define __INFINIOP_ALL_EQUAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAllEqualDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAllEqualDescriptor(
+    infiniopHandle_t handle,
+    infiniopAllEqualDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc);
+
+__C __export infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAllEqual(
+    infiniopAllEqualDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc);
+
+#endif // __INFINIOP_ALL_EQUAL_API_H__
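Reviewer note: a minimal host-side sketch of the intended call sequence for the new C API. The four `infiniopAllEqual*` functions come from the header above; the handle, tensor descriptors, and device buffers are assumed to be set up with the existing infiniop/infinirt helpers (only `infinirtMalloc` is shown, since it is already used elsewhere in this diff), so treat the surrounding scaffolding as an assumption rather than part of this change.

```cpp
#include "infiniop.h"
#include "infinirt.h"
#include <cassert>

// c is a 1-element INFINI_DTYPE_BOOL tensor; a and b share one shape/dtype.
void all_equal_example(infiniopHandle_t handle,
                       infiniopTensorDescriptor_t c_desc,
                       infiniopTensorDescriptor_t a_desc,
                       infiniopTensorDescriptor_t b_desc,
                       void *c, const void *a, const void *b, void *stream) {
    infiniopAllEqualDescriptor_t desc;
    assert(infiniopCreateAllEqualDescriptor(handle, &desc, c_desc, a_desc, b_desc)
           == INFINI_STATUS_SUCCESS);

    size_t workspace_size = 0;
    assert(infiniopGetAllEqualWorkspaceSize(desc, &workspace_size)
           == INFINI_STATUS_SUCCESS);

    void *workspace = nullptr;
    assert(infinirtMalloc(&workspace, workspace_size) == INFINI_STATUS_SUCCESS);

    // Writes a single bool into c: true iff a and b match element-wise.
    assert(infiniopAllEqual(desc, workspace, workspace_size, c, a, b, stream)
           == INFINI_STATUS_SUCCESS);

    assert(infiniopDestroyAllEqualDescriptor(desc) == INFINI_STATUS_SUCCESS);
}
```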
diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
index 3820f7cfd..ad10d2d73 100644
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -16,6 +16,7 @@ DECLARE_INFINIOP_TEST(add)
 DECLARE_INFINIOP_TEST(causal_softmax)
 DECLARE_INFINIOP_TEST(rearrange)
 DECLARE_INFINIOP_TEST(sub)
+DECLARE_INFINIOP_TEST(all_equal)
 
 #define REGISTER_INFINIOP_TEST(name) \
     { \
@@ -43,6 +44,7 @@ DECLARE_INFINIOP_TEST(sub)
     REGISTER_INFINIOP_TEST(causal_softmax) \
     REGISTER_INFINIOP_TEST(rearrange) \
     REGISTER_INFINIOP_TEST(sub) \
+    REGISTER_INFINIOP_TEST(all_equal) \
 }
 
 namespace infiniop_test {
diff --git a/src/infiniop-test/src/ops/all_equal.cpp b/src/infiniop-test/src/ops/all_equal.cpp
new file mode 100644
index 000000000..34b0be012
--- /dev/null
+++ b/src/infiniop-test/src/ops/all_equal.cpp
@@ -0,0 +1,110 @@
+#include "../../../include/infiniop/ops/all_equal.h"
+#include "ops.hpp"
+#include "utils.hpp"
+#include <iomanip>
+#include <sstream>
+#include <stdexcept>
+
+namespace infiniop_test::all_equal {
+struct Test::Attributes {
+    std::shared_ptr<Tensor> a;
+    std::shared_ptr<Tensor> b;
+    std::shared_ptr<Tensor> c;
+    std::shared_ptr<Tensor> ans;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+    if (tensors.find("a") == tensors.end()
+        || tensors.find("b") == tensors.end()
+        || tensors.find("c") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Test");
+    }
+
+    test->_attributes->a = tensors["a"];
+    test->_attributes->b = tensors["b"];
+    test->_attributes->c = tensors["c"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    infiniopAllEqualDescriptor_t op_desc;
+    auto a = _attributes->a->to(device, device_id);
+    auto b = _attributes->b->to(device, device_id);
+    auto c = _attributes->c->to(device, device_id);
+    CHECK_OR(infiniopCreateAllEqualDescriptor(handle, &op_desc,
+                                              c->desc(),
+                                              a->desc(),
+                                              b->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetAllEqualWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopAllEqual(op_desc, workspace, workspace_size,
+                              c->data(),
+                              a->data(),
+                              b->data(),
+                              nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        allClose(c, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopAllEqual(
+                op_desc, workspace, workspace_size,
+                c->data(),
+                a->data(),
+                b->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"a", "b", "c", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"c"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- a: " << _attributes->a->info() << std::endl;
+    oss << "- b: " << _attributes->b->info() << std::endl;
+    oss << "- c: " << _attributes->c->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::all_equal
diff --git a/src/infiniop/ops/all_equal/all_equal.h b/src/infiniop/ops/all_equal/all_equal.h
new file mode 100644
index 000000000..3eb453cf1
--- /dev/null
+++ b/src/infiniop/ops/all_equal/all_equal.h
@@ -0,0 +1,46 @@
+#ifndef __ALL_EQUAL_H__
+#define __ALL_EQUAL_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                                \
+    namespace op::all_equal::NAMESPACE {                                     \
+    class Descriptor final : public InfiniopDescriptor {                     \
+        struct Opaque;                                                       \
+        Opaque *_opaque;                                                     \
+        op::all_equal::AllEqualInfo _info;                                   \
+        size_t _workspace_size;                                              \
+        Descriptor(                                                          \
+            infiniDtype_t dtype,                                             \
+            op::all_equal::AllEqualInfo info,                                \
+            size_t workspace_size_,                                          \
+            Opaque *opaque,                                                  \
+            infiniDevice_t device_type,                                      \
+            int device_id) : InfiniopDescriptor{device_type, device_id},     \
+                             _opaque(opaque),                                \
+                             _info(info),                                    \
+                             _workspace_size(workspace_size_) {}             \
+                                                                             \
+    public:                                                                  \
+        ~Descriptor();                                                       \
+        size_t workspaceSize() const { return _workspace_size; }             \
+        static infiniStatus_t create(                                        \
+            infiniopHandle_t handle,                                         \
+            Descriptor **desc_ptr,                                           \
+            infiniopTensorDescriptor_t c_desc,                               \
+            infiniopTensorDescriptor_t a_desc,                               \
+            infiniopTensorDescriptor_t b_desc);                              \
+        infiniStatus_t calculate(                                            \
+            void *workspace,                                                 \
+            size_t workspace_size,                                           \
+            void *c,                                                         \
+            const void *a,                                                   \
+            const void *b,                                                   \
+            void *stream) const;                                             \
+    };                                                                       \
+    }
+
+#endif // __ALL_EQUAL_H__
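Reviewer note: for readability, `DESCRIPTOR(cpu)` above expands (modulo whitespace) to the class skeleton below; each backend header simply stamps this out in its own namespace.

```cpp
namespace op::all_equal::cpu {
class Descriptor final : public InfiniopDescriptor {
    struct Opaque;
    Opaque *_opaque;
    op::all_equal::AllEqualInfo _info;
    size_t _workspace_size;

    Descriptor(infiniDtype_t dtype, op::all_equal::AllEqualInfo info,
               size_t workspace_size_, Opaque *opaque,
               infiniDevice_t device_type, int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _opaque(opaque), _info(info), _workspace_size(workspace_size_) {}

public:
    ~Descriptor();
    size_t workspaceSize() const { return _workspace_size; }
    static infiniStatus_t create(
        infiniopHandle_t handle, Descriptor **desc_ptr,
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t a_desc,
        infiniopTensorDescriptor_t b_desc);
    infiniStatus_t calculate(
        void *workspace, size_t workspace_size,
        void *c, const void *a, const void *b, void *stream) const;
};
} // namespace op::all_equal::cpu
```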
+#include "../info.h" + +namespace op::all_equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + auto handle = reinterpret_cast(handle_); + + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = c_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL); + CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t WorkSpaceSize = 0; + // ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = AllEqualInfo::createAllEqualInfo( + c_desc, + a_desc, + b_desc); + CHECK_RESULT(result); + const AllEqualInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) const { + std::vector contiguous_strides(_info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for (size_t d = 0; d < _info.ndim; d++) { + contiguous_strides[d] = last_dim * last_stride; + last_dim = _info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + size_t elem_size = infiniSizeOf(_info.dtype); + auto c_ptr = reinterpret_cast(c); + *c_ptr = true; +#pragma omp parallel for + for (int i = 0; i < static_cast(total_size); i++) { + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + size_t rem = static_cast(i); + for (int d = static_cast(_info.ndim) - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + a_ptr += dim_index * _info.a_strides[d]; + b_ptr += dim_index * _info.b_strides[d]; + } + if (memcmp(a_ptr, b_ptr, elem_size) != 0) { + *c_ptr = false; + } + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::all_equal::cpu diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h new file mode 100644 index 000000000..5cc5f2fdd --- /dev/null +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ALL_EQUAL_CPU_H__ +#define __ALL_EQUAL_CPU_H__ + +#include "../all_equal.h" + +DESCRIPTOR(cpu) + +#endif // __ALL_EQUAL_CPU_H__ diff --git a/src/infiniop/ops/all_equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh new file mode 100644 index 000000000..048d33a81 --- /dev/null +++ b/src/infiniop/ops/all_equal/cuda/kernel.cuh @@ -0,0 +1,54 @@ +#ifndef __ALL_EQUAL_KERNEL_CUH__ +#define __ALL_EQUAL_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void allEqualKernel( + bool *c, + const Tdata *a, + const Tdata *b, + size_t ndim, + size_t total_size, + ptrdiff_t *contiguous_strides, + ptrdiff_t *a_strides, + ptrdiff_t *b_strides) { + // 使用共享内存来避免竞态条件 + __shared__ bool block_result; + + if (threadIdx.x == 0) { + block_result = true; + } + __syncthreads(); + + // 每个线程检查自己负责的元素 + bool thread_result = true; + for (size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) { + auto a_ptr = a; + auto b_ptr = b; + size_t rem = i; + for (int d = ndim - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem 
diff --git a/src/infiniop/ops/all_equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh
new file mode 100644
index 000000000..048d33a81
--- /dev/null
+++ b/src/infiniop/ops/all_equal/cuda/kernel.cuh
@@ -0,0 +1,55 @@
+#ifndef __ALL_EQUAL_KERNEL_CUH__
+#define __ALL_EQUAL_KERNEL_CUH__
+// ------------------------------- start: perform operator on CUDA --------------------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+__device__ void allEqualKernel(
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    size_t ndim,
+    size_t total_size,
+    ptrdiff_t *contiguous_strides,
+    ptrdiff_t *a_strides,
+    ptrdiff_t *b_strides) {
+    // Block-wide result flag in shared memory, initialized by thread 0.
+    __shared__ bool block_result;
+
+    if (threadIdx.x == 0) {
+        block_result = true;
+    }
+    __syncthreads();
+
+    // Each thread checks its own strided subset of the elements.
+    bool thread_result = true;
+    for (size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) {
+        auto a_ptr = a;
+        auto b_ptr = b;
+        size_t rem = i;
+        for (int d = static_cast<int>(ndim) - 1; d >= 0; d--) {
+            size_t dim_index = rem / contiguous_strides[d];
+            rem = rem % contiguous_strides[d];
+            a_ptr += dim_index * a_strides[d];
+            b_ptr += dim_index * b_strides[d];
+        }
+        if (*a_ptr != *b_ptr) {
+            thread_result = false;
+            break; // mismatch found, stop early
+        }
+    }
+
+    // A plain store is sufficient: every writer stores the same value, and the
+    // barrier below orders these stores before thread 0 reads the flag.
+    if (!thread_result) {
+        block_result = false;
+    }
+
+    __syncthreads();
+
+    // Only the first thread writes the final result.
+    if (threadIdx.x == 0) {
+        *c = block_result;
+    }
+}
+// -------------------------------- end: perform operator on CUDA ---------------------------------
+
+#endif // __ALL_EQUAL_KERNEL_CUH__
diff --git a/src/infiniop/ops/all_equal/info.h b/src/infiniop/ops/all_equal/info.h
new file mode 100644
index 000000000..71cbb1bb6
--- /dev/null
+++ b/src/infiniop/ops/all_equal/info.h
@@ -0,0 +1,44 @@
+#ifndef __ALL_EQUAL_INFO_H__
+#define __ALL_EQUAL_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+
+namespace op::all_equal {
+
+class AllEqualInfo {
+private:
+    AllEqualInfo() = default;
+
+public:
+    // ---------------------------- start: define member variables of Info ----------------------------
+    size_t ndim;
+    infiniDtype_t dtype;
+    std::vector<size_t> a_shape;
+    std::vector<ptrdiff_t> a_strides;
+    std::vector<ptrdiff_t> b_strides;
+    // ----------------------------- end: define member variables of Info -----------------------------
+
+    static utils::Result<AllEqualInfo> createAllEqualInfo(
+        infiniopTensorDescriptor_t c_desc,
+        infiniopTensorDescriptor_t a_desc,
+        infiniopTensorDescriptor_t b_desc) {
+        // ------------------------- start: check tensor shape and input validity -------------------------
+        CHECK_OR_RETURN(c_desc->ndim() == 1 && c_desc->dim(0) == 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        CHECK_SAME_SHAPE(a_desc->shape(), b_desc->shape());
+        // -------------------------- end: check tensor shape and input validity --------------------------
+        return utils::Result<AllEqualInfo>(AllEqualInfo{
+            // ------------------------------ start: create an instance of Info -------------------------------
+            a_desc->ndim(),
+            a_desc->dtype(),
+            a_desc->shape(),
+            a_desc->strides(),
+            b_desc->strides()
+            // ------------------------------- end: create an instance of Info --------------------------------
+        });
+    }
+};
+} // namespace op::all_equal
+
+#endif // __ALL_EQUAL_INFO_H__
diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.h b/src/infiniop/ops/all_equal/metax/all_equal_metax.h
new file mode 100644
index 000000000..fbb9ef70a
--- /dev/null
+++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.h
@@ -0,0 +1,8 @@
+#ifndef __ALL_EQUAL_METAX_H__
+#define __ALL_EQUAL_METAX_H__
+
+#include "../all_equal.h"
+
+DESCRIPTOR(metax)
+
+#endif // __ALL_EQUAL_METAX_H__
diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.maca b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca
new file mode 100644
index 000000000..df50ffbee
--- /dev/null
+++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca
@@ -0,0 +1,153 @@
+#include "../../../devices/metax/metax_common.h"
+#include "all_equal_metax.h"
+#include <memory>
+#include "../../../devices/metax/metax_kernel_common.h"
+#include "../../../reduce/cuda/reduce.cuh"
+#include "../cuda/kernel.cuh"
+#include "../info.h"
+
+namespace op::all_equal::metax {
+
+template <unsigned int BLOCK_SIZE, typename Tdata>
+INFINIOP_METAX_KERNEL launchKernel(
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    size_t ndim,
+    size_t total_size,
+    ptrdiff_t *contiguous_strides,
+    ptrdiff_t *a_strides,
+    ptrdiff_t *b_strides) {
+    allEqualKernel<BLOCK_SIZE, Tdata>(
+        c,
+        a,
+        b,
+        ndim,
+        total_size,
+        contiguous_strides,
+        a_strides,
+        b_strides);
+}
+
+// ----------------------------------- start: call launchKernel -----------------------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+infiniStatus_t calculate_all_equal(
+    const AllEqualInfo &info,
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    hcStream_t stream,
+    void *workspace) {
+    size_t ndim = info.ndim;
+    std::vector<ptrdiff_t> contiguous_strides(ndim);
+    size_t last_dim = 1, last_stride = 1;
+    for (size_t d = 0; d < ndim; d++) {
+        contiguous_strides[d] = last_dim * last_stride;
+        last_dim = info.a_shape[d];
+        last_stride = contiguous_strides[d];
+    }
+    size_t total_size = last_dim * last_stride;
+
+    ptrdiff_t *contiguous_strides_dev = reinterpret_cast<ptrdiff_t *>(workspace);
+    ptrdiff_t *a_strides_dev = contiguous_strides_dev + ndim;
+    ptrdiff_t *b_strides_dev = a_strides_dev + ndim;
+
+    // contiguous_strides is stack-local, so copy it synchronously (assuming
+    // hcMemcpy mirrors cudaMemcpy, as the other hc* calls here mirror cuda*);
+    // the stride vectors owned by info outlive this call and may go async.
+    CHECK_METAX(hcMemcpy(contiguous_strides_dev, contiguous_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice));
+    CHECK_METAX(hcMemcpyAsync(a_strides_dev, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream));
+    CHECK_METAX(hcMemcpyAsync(b_strides_dev, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream));
+
+    launchKernel<BLOCK_SIZE, Tdata><<<1, BLOCK_SIZE, 0, stream>>>(
+        c,
+        a,
+        b,
+        ndim,
+        total_size,
+        contiguous_strides_dev,
+        a_strides_dev,
+        b_strides_dev);
+    return INFINI_STATUS_SUCCESS;
+}
+// ------------------------------------ end: call launchKernel ------------------------------------
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc) {
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    // --------------------- start: check data type and calculate workspace size ----------------------
+    auto dtype = a_desc->dtype();
+    CHECK_DTYPE(c_desc->dtype(), INFINI_DTYPE_BOOL);
+    CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
+    auto result = AllEqualInfo::createAllEqualInfo(
+        c_desc,
+        a_desc,
+        b_desc);
+    CHECK_RESULT(result);
+    const AllEqualInfo &info = result.take();
+    size_t workspace_size = sizeof(ptrdiff_t) * info.ndim * 3;
+    // ---------------------- end: check data type and calculate workspace size -----------------------
+
+    *desc_ptr = new Descriptor(
+        dtype, std::move(info), workspace_size,
+        new Opaque{handle->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream_) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    hcStream_t stream = (hcStream_t)stream_;
+
+#define CALCULATE_EQUAL(TDATA) \
+    calculate_all_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
+    switch (_info.dtype) {
+    case INFINI_DTYPE_U8:
+        return CALCULATE_EQUAL(uint8_t);
+    case INFINI_DTYPE_U16:
+        return CALCULATE_EQUAL(uint16_t);
+    case INFINI_DTYPE_U32:
+        return CALCULATE_EQUAL(uint32_t);
+    case INFINI_DTYPE_U64:
+        return CALCULATE_EQUAL(uint64_t);
+    case INFINI_DTYPE_I8:
+        return CALCULATE_EQUAL(int8_t);
+    case INFINI_DTYPE_I16:
+        return CALCULATE_EQUAL(int16_t);
+    case INFINI_DTYPE_I32:
+        return CALCULATE_EQUAL(int32_t);
+    case INFINI_DTYPE_I64:
+        return CALCULATE_EQUAL(int64_t);
+    case INFINI_DTYPE_F16:
+        return CALCULATE_EQUAL(half);
+    case INFINI_DTYPE_F32:
+        return CALCULATE_EQUAL(float);
+    case INFINI_DTYPE_BF16:
+        return CALCULATE_EQUAL(cuda_bfloat16);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+#undef CALCULATE_EQUAL
+}
+} // namespace op::all_equal::metax
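Reviewer note: both GPU backends carve the single workspace allocation (sized `3 * ndim * sizeof(ptrdiff_t)` in `create`) into three stride tables. A plain C++ sketch of that layout, with a host buffer standing in for device memory and hypothetical helper names:

```cpp
#include <cstddef>
#include <cstdlib>

// The workspace returned by infiniopGetAllEqualWorkspaceSize holds
// 3 * ndim ptrdiff_t values: one table per stride array.
struct StrideTables {
    ptrdiff_t *contiguous; // entries [0, ndim)
    ptrdiff_t *a;          // entries [ndim, 2*ndim)
    ptrdiff_t *b;          // entries [2*ndim, 3*ndim)
};

StrideTables carve_workspace(void *workspace, size_t ndim) {
    auto base = reinterpret_cast<ptrdiff_t *>(workspace);
    return StrideTables{base, base + ndim, base + 2 * ndim};
}

int main() {
    size_t ndim = 3;
    void *ws = std::malloc(3 * ndim * sizeof(ptrdiff_t)); // stand-in for device memory
    StrideTables t = carve_workspace(ws, ndim);
    t.contiguous[0] = 1; t.a[0] = 16; t.b[0] = 16; // and so on per dimension
    std::free(ws);
}
```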
diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu
new file mode 100644
index 000000000..4d8331e80
--- /dev/null
+++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu
@@ -0,0 +1,153 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_handle.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../cuda/kernel.cuh"
+#include "../info.h"
+#include "all_equal_nvidia.cuh"
+
+namespace op::all_equal::nvidia {
+
+// ---------------------- start: launchKernel: call kernel function of CUDA -----------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+INFINIOP_CUDA_KERNEL launchKernel(
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    size_t ndim,
+    size_t total_size,
+    ptrdiff_t *contiguous_strides,
+    ptrdiff_t *a_strides,
+    ptrdiff_t *b_strides) {
+    allEqualKernel<BLOCK_SIZE, Tdata>(
+        c,
+        a,
+        b,
+        ndim,
+        total_size,
+        contiguous_strides,
+        a_strides,
+        b_strides);
+}
+// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------
+
+// ----------------------------------- start: call launchKernel -----------------------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+infiniStatus_t calculate_all_equal(
+    const AllEqualInfo &info,
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    cudaStream_t stream,
+    void *workspace) {
+    size_t ndim = info.ndim;
+    std::vector<ptrdiff_t> contiguous_strides(ndim);
+    size_t last_dim = 1, last_stride = 1;
+    for (size_t d = 0; d < ndim; d++) {
+        contiguous_strides[d] = last_dim * last_stride;
+        last_dim = info.a_shape[d];
+        last_stride = contiguous_strides[d];
+    }
+    size_t total_size = last_dim * last_stride;
+
+    ptrdiff_t *contiguous_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace);
+    ptrdiff_t *a_strides_cuda = contiguous_strides_cuda + ndim;
+    ptrdiff_t *b_strides_cuda = a_strides_cuda + ndim;
+
+    // contiguous_strides is stack-local, so copy it synchronously; the stride
+    // vectors owned by info outlive this call and may be copied asynchronously.
+    CHECK_CUDA(cudaMemcpy(contiguous_strides_cuda, contiguous_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream));
+    CHECK_CUDA(cudaMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream));
+
+    launchKernel<BLOCK_SIZE, Tdata><<<1, BLOCK_SIZE, 0, stream>>>(
+        c,
+        a,
+        b,
+        ndim,
+        total_size,
+        contiguous_strides_cuda,
+        a_strides_cuda,
+        b_strides_cuda);
+
+    return INFINI_STATUS_SUCCESS;
+}
+// ------------------------------------ end: call launchKernel ------------------------------------
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc) {
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    // --------------------- start: check data type and calculate workspace size ----------------------
+    auto dtype = a_desc->dtype();
+    CHECK_DTYPE(c_desc->dtype(), INFINI_DTYPE_BOOL);
+    CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
+    auto result = AllEqualInfo::createAllEqualInfo(
+        c_desc,
+        a_desc,
+        b_desc);
+    CHECK_RESULT(result);
+    const AllEqualInfo &info = result.take();
+    size_t workspace_size = sizeof(ptrdiff_t) * info.ndim * 3;
+    // ---------------------- end: check data type and calculate workspace size -----------------------
+    *desc_ptr = new Descriptor(
+        dtype, std::move(info), workspace_size,
+        new Opaque{handle->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream_) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    cudaStream_t stream = (cudaStream_t)stream_;
+
+#define CALCULATE_EQUAL(TDATA) \
+    calculate_all_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
+    switch (_info.dtype) {
+    case INFINI_DTYPE_U8:
+        return CALCULATE_EQUAL(uint8_t);
+    case INFINI_DTYPE_U16:
+        return CALCULATE_EQUAL(uint16_t);
+    case INFINI_DTYPE_U32:
+        return CALCULATE_EQUAL(uint32_t);
+    case INFINI_DTYPE_U64:
+        return CALCULATE_EQUAL(uint64_t);
+    case INFINI_DTYPE_I8:
+        return CALCULATE_EQUAL(int8_t);
+    case INFINI_DTYPE_I16:
+        return CALCULATE_EQUAL(int16_t);
+    case INFINI_DTYPE_I32:
+        return CALCULATE_EQUAL(int32_t);
+    case INFINI_DTYPE_I64:
+        return CALCULATE_EQUAL(int64_t);
+    case INFINI_DTYPE_F16:
+        return CALCULATE_EQUAL(half);
+    case INFINI_DTYPE_F32:
+        return CALCULATE_EQUAL(float);
+    case INFINI_DTYPE_BF16:
+        return CALCULATE_EQUAL(cuda_bfloat16);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+#undef CALCULATE_EQUAL
+}
+} // namespace op::all_equal::nvidia
diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh
new file mode 100644
index 000000000..c459e8def
--- /dev/null
+++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh
@@ -0,0 +1,7 @@
+#ifndef __ALL_EQUAL_NVIDIA_API_H__
+#define __ALL_EQUAL_NVIDIA_API_H__
+#include "../all_equal.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __ALL_EQUAL_NVIDIA_API_H__
diff --git a/src/infiniop/ops/all_equal/operator.cc b/src/infiniop/ops/all_equal/operator.cc
new file mode 100644
index 000000000..f89e565a7
--- /dev/null
+++ b/src/infiniop/ops/all_equal/operator.cc
@@ -0,0 +1,149 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/all_equal.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/all_equal_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/all_equal_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/all_equal_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateAllEqualDescriptor(
+    infiniopHandle_t handle,
+    infiniopAllEqualDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        return op::all_equal::NAMESPACE::Descriptor::create(                      \
+            handle,                                                               \
+            reinterpret_cast<op::all_equal::NAMESPACE::Descriptor **>(desc_ptr), \
+            c_desc,                                                               \
+            a_desc,                                                               \
+            b_desc)
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                      \
+    case CASE:                                                                                    \
+        *size = reinterpret_cast<op::all_equal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
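Reviewer note: the dispatch macros in `operator.cc` expand mechanically; e.g. `CALCULATE(INFINI_DEVICE_CPU, cpu)` inside the switch in `infiniopAllEqual` becomes:

```cpp
case INFINI_DEVICE_CPU:
    return reinterpret_cast<op::all_equal::cpu::Descriptor *>(desc)->calculate(
        workspace, workspace_size, c, a, b, stream);
```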
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef GET
+}
+
+__C infiniStatus_t infiniopAllEqual(
+    infiniopAllEqualDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                         \
+    case CASE:                                                                             \
+        return reinterpret_cast<op::all_equal::NAMESPACE::Descriptor *>(desc)->calculate( \
+            workspace,                                                                     \
+            workspace_size,                                                                \
+            c,                                                                             \
+            a,                                                                             \
+            b,                                                                             \
+            stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                  \
+        delete reinterpret_cast<op::all_equal::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
diff --git a/test/infiniop/all_equal.py b/test/infiniop/all_equal.py
new file mode 100644
index 000000000..adcf3d366
--- /dev/null
+++ b/test/infiniop/all_equal.py
@@ -0,0 +1,220 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride
+    ((13, 4), None, None),
+    ((13, 4), (13, 1), (13, 1)),
+    ((13, 4, 4), (16, 4, 1), (16, 4, 1)),
+    ((16, 5632), None, None),
+]
+
+
+class Identical(Enum):
+    EQUAL = auto()
+    NOT_EQUAL = auto()
+
+
+_IDENTICAL = [
+    Identical.EQUAL,  # -> result=true
+    Identical.NOT_EQUAL,  # -> result=false
+]
+
+_TEST_CASES = [
+    test_case + (identical_item,)
+    for test_case in _TEST_CASES_
+    for identical_item in _IDENTICAL
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64]
+
+# Tolerance map for different data types; all_equal is exact, so zero tolerance
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 0, "rtol": 0},
+    InfiniDtype.F32: {"atol": 0, "rtol": 0},
+    InfiniDtype.BF16: {"atol": 0, "rtol": 0},
+    InfiniDtype.I32: {"atol": 0, "rtol": 0},
+    InfiniDtype.I64: {"atol": 0, "rtol": 0},
+}
+
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def torch_equal(c, a, b):
+    return torch.tensor(torch.equal(input=a, other=b), dtype=torch.bool)
+
+
+def test(
+    handle,
+    device,
+    input_shape,
+    a_strides,
+    b_strides,
+    identical,
+    dtype,
+    sync=None,
+):
+    torch_dtype = {
+        InfiniDtype.F16: torch.half,
+        InfiniDtype.F32: torch.float,
+        InfiniDtype.BF16: torch.bfloat16,
+        InfiniDtype.I32: torch.int32,
+        InfiniDtype.I64: torch.int64,
+    }[dtype]
+    is_integer_dtype = torch_dtype in (torch.int32, torch.int64)
+
+    print(
+        f"Testing all_equal on {InfiniDeviceNames[device]} with input_shape:{input_shape},"
+        f" a_stride:{a_strides} b_stride:{b_strides} identical:{identical},"
+        f" dtype:{InfiniDtypeNames[dtype]}"
+    )
+    torch_c = torch.tensor([False], dtype=torch.bool)
+    c = TestTensor(
+        [1],
+        torch_c.stride(),
+        InfiniDtype.BOOL,
+        device,
+        "manual",
+        set_tensor=torch_c,
+    )
+
+    if a_strides is None:
+        torch_a = (torch.rand(input_shape) * 100 - 50).type(torch_dtype)
+    else:
+        # Allocate storage that can support the requested strides
+        torch_a = torch.empty_strided(input_shape, a_strides, dtype=torch_dtype)
+        if is_integer_dtype:
+            tmp_a = torch.randint(-50, 50, input_shape, dtype=torch_dtype)
+            torch_a.copy_(tmp_a)
+        else:
+            torch_a.uniform_(-50, 50)
+    a = TestTensor(
+        input_shape,
+        torch_a.stride(),
+        dtype,
+        device,
+        "manual",
+        set_tensor=torch_a,
+    )
+    if identical == Identical.EQUAL:
+        if b_strides is None:
+            torch_b = torch_a.clone()
+        else:
+            # Create b with the desired strides and copy values from a to ensure equality
+            torch_b = torch.empty_strided(input_shape, b_strides, dtype=torch_dtype)
+            torch_b.copy_(torch_a)
+    else:
+        if b_strides is None:
+            torch_b = (torch.rand(input_shape) * 100 - 50).type(torch_dtype)
+        else:
+            torch_b = torch.empty_strided(input_shape, b_strides, dtype=torch_dtype)
+            if is_integer_dtype:
+                tmp_b = torch.randint(-50, 50, input_shape, dtype=torch_dtype)
+                torch_b.copy_(tmp_b)
+            else:
+                torch_b.uniform_(-50, 50)
+
+    b = TestTensor(
+        input_shape,
+        torch_b.stride(),
+        dtype,
+        device,
+        "manual",
+        set_tensor=torch_b,
+    )
+
+    # Reference result computed with torch.equal
+    c._torch_tensor = torch_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateAllEqualDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [c, a, b]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAllEqualWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_equal():
+        check_error(
+            LIBINFINIOP.infiniopAllEqual(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_equal()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(c.actual_tensor().to(torch.uint8), c.torch_tensor().to(torch.uint8), atol=atol, rtol=rtol)
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch_equal(
+            c.torch_tensor(), a.torch_tensor(), b.torch_tensor()
+        ), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyAllEqualDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest all_equal passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index ba1ce33df..a7ca1a113 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -583,3 +583,32 @@ def softplus_(lib):
     ]
     lib.infiniopDestroySoftplusDescriptor.restype = c_int32
     lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+@OpRegister.operator
+def all_equal_(lib):
+    lib.infiniopCreateAllEqualDescriptor.restype = c_int32
+    lib.infiniopCreateAllEqualDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetAllEqualWorkspaceSize.restype = c_int32
+    lib.infiniopGetAllEqualWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopAllEqual.restype = c_int32
+    lib.infiniopAllEqual.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyAllEqualDescriptor.restype = c_int32
+    lib.infiniopDestroyAllEqualDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py
index 162b199fe..8c7570d41 100644
--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -139,6 +139,8 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum):
 
 
 def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
+    if dt == InfiniDtype.BOOL:  # support torch.bool input dtype
+        return torch.bool
     if dt == InfiniDtype.I8:
         return torch.int8
     elif dt == InfiniDtype.I16:
@@ -269,7 +271,7 @@ def rearrange_tensor(tensor, new_strides):
         new_positions += offset
 
     # Copy the original data to the new tensor
-    new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
+    new_tensor.view(-1).index_add_(0, new_positions, tensor.reshape(-1))
     new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
     return new_tensor