InfiniTensor · gongchensu · Sep 17, 2025 · Sep 29, 2025 · Sep 30, 2025 · Sep 30, 2025
diff --git a/include/infiniop.h b/include/infiniop.h
@@ -20,5 +20,6 @@
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/tensor_descriptor.h"
+#include "infiniop/ops/all_equal.h"
 
-#endif // __INFINIOP_API_H__
+#endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/all_equal.h b/include/infiniop/ops/all_equal.h
@@ -0,0 +1,30 @@
+#ifndef __INFINIOP_ALL_EQUAL_API_H__
+#define __INFINIOP_ALL_EQUAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAllEqualDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAllEqualDescriptor(
+    infiniopHandle_t handle,
+    infiniopAllEqualDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc
+);
+
+__C __export infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAllEqual(
+    infiniopAllEqualDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void * c,
+    const void * a,
+    const void * b,
+    void *stream
+);
+
+__C __export infiniStatus_t infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc);
+
+#endif
diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
@@ -16,6 +16,7 @@ DECLARE_INFINIOP_TEST(add)
 DECLARE_INFINIOP_TEST(causal_softmax)
 DECLARE_INFINIOP_TEST(rearrange)
 DECLARE_INFINIOP_TEST(sub)
+DECLARE_INFINIOP_TEST(all_equal)
 
 #define REGISTER_INFINIOP_TEST(name)                      \
     {                                                     \
@@ -43,6 +44,7 @@ DECLARE_INFINIOP_TEST(sub)
         REGISTER_INFINIOP_TEST(causal_softmax) \
         REGISTER_INFINIOP_TEST(rearrange)      \
         REGISTER_INFINIOP_TEST(sub)            \
+        REGISTER_INFINIOP_TEST(all_equal)      \
     }
 
 namespace infiniop_test {

diff --git a/src/infiniop-test/src/ops/all_equal.cpp b/src/infiniop-test/src/ops/all_equal.cpp
@@ -0,0 +1,110 @@
+#include "../../../include/infiniop/ops/all_equal.h"
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <iostream>
+
+namespace infiniop_test::all_equal {
+struct Test::Attributes {
+    std::shared_ptr<Tensor> a;
+    std::shared_ptr<Tensor> b;
+    std::shared_ptr<Tensor> c;
+    std::shared_ptr<Tensor> ans;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+    if (tensors.find("a") == tensors.end()
+        || tensors.find("b") == tensors.end()
+        || tensors.find("c") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Test");
+    }
+
+    test->_attributes->a = tensors["a"];
+    test->_attributes->b = tensors["b"];
+    test->_attributes->c = tensors["c"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    infiniopAllEqualDescriptor_t op_desc;
+    auto a = _attributes->a->to(device, device_id);
+    auto b = _attributes->b->to(device, device_id);
+    auto c = _attributes->c->to(device, device_id);
+    CHECK_OR(infiniopCreateAllEqualDescriptor(handle, &op_desc,
+                                              c->desc(),
+                                              a->desc(),
+                                              b->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetAllEqualWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopAllEqual(op_desc, workspace, workspace_size,
+                              c->data(),
+                              a->data(),
+                              b->data(),
+                              nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        allClose(c, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopAllEqual(
+                op_desc, workspace, workspace_size,
+                c->data(),
+                a->data(),
+                b->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"a", "b", "c", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"c"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- a: " << _attributes->a->info() << std::endl;
+    oss << "- b: " << _attributes->b->info() << std::endl;
+    oss << "- c: " << _attributes->c->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::all_equal
diff --git a/src/infiniop/ops/all_equal/all_equal.h b/src/infiniop/ops/all_equal/all_equal.h
@@ -0,0 +1,46 @@
+#ifndef __ALL_EQUAL_H__
+#define __ALL_EQUAL_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                            \
+    namespace op::all_equal::NAMESPACE {                                 \
+    class Descriptor final : public InfiniopDescriptor {                 \
+        struct Opaque;                                                   \
+        Opaque *_opaque;                                                 \
+        op::all_equal::AllEqualInfo _info;                               \
+        size_t _workspace_size;                                          \
+        Descriptor(                                                      \
+            infiniDtype_t dtype,                                         \
+            op::all_equal::AllEqualInfo info,                            \
+            size_t workspace_size_,                                      \
+            Opaque *opaque,                                              \
+            infiniDevice_t device_type,                                  \
+            int device_id) : InfiniopDescriptor{device_type, device_id}, \
+                             _opaque(opaque),                            \
+                             _info(info),                                \
+                             _workspace_size(workspace_size_) {}         \
+                                                                         \
+    public:                                                              \
+        ~Descriptor();                                                   \
+        size_t workspaceSize() const { return _workspace_size; }         \
+        static infiniStatus_t create(                                    \
+            infiniopHandle_t handle,                                     \
+            Descriptor **desc_ptr,                                       \
+            infiniopTensorDescriptor_t c_desc,                           \
+            infiniopTensorDescriptor_t a_desc,                           \
+            infiniopTensorDescriptor_t b_desc);                          \
+        infiniStatus_t calculate(                                        \
+            void *workspace,                                             \
+            size_t workspace_size,                                       \
+            void *c,                                                     \
+            const void *a,                                               \
+            const void *b,                                               \
+            void *stream) const;                                         \
+    };                                                                   \
+    }
+
+#endif
diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc
@@ -0,0 +1,76 @@
+#include "all_equal_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../../reduce/cpu/reduce.h"
+#include "../info.h"
+
+namespace op::all_equal::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc) {
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    //  --------------------- start: check data type and calculate workspace size ----------------------
+    auto dtype = c_desc->dtype();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL);
+    CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
+    size_t WorkSpaceSize = 0;
+    //  ---------------------- end: check data type and calculate workspace size -----------------------
+
+    auto result = AllEqualInfo::createAllEqualInfo(
+        c_desc,
+        a_desc,
+        b_desc);
+    CHECK_RESULT(result);
+    const AllEqualInfo &info = result.take();
+
+    *desc_ptr = new Descriptor(
+        dtype, std::move(info), WorkSpaceSize,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream) const {
+    std::vector<ptrdiff_t> contiguous_strides(_info.ndim);
+    ptrdiff_t last_dim = 1;
+    ptrdiff_t last_stride = 1;
+    for (size_t d = 0; d < _info.ndim; d++) {
+        contiguous_strides[d] = last_dim * last_stride;
+        last_dim = _info.a_shape[d];
+        last_stride = contiguous_strides[d];
+    }
+    size_t total_size = last_dim * last_stride;
+    size_t elem_size = infiniSizeOf(_info.dtype);
+    auto c_ptr = reinterpret_cast<bool *>(c);
+    *c_ptr = true;
+#pragma omp parallel for
+    for (int i = 0; i < static_cast<int>(total_size); i++) {
+        auto a_ptr = reinterpret_cast<const char *>(a);
+        auto b_ptr = reinterpret_cast<const char *>(b);
+        size_t rem = static_cast<size_t>(i);
+        for (int d = static_cast<int>(_info.ndim) - 1; d >= 0; d--) {
+            size_t dim_index = rem / contiguous_strides[d];
+            rem = rem % contiguous_strides[d];
+            a_ptr += dim_index * _info.a_strides[d];
+            b_ptr += dim_index * _info.b_strides[d];
+        }
+        if (memcmp(a_ptr, b_ptr, elem_size) != 0) {
+            *c_ptr = false;
+        }
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::all_equal::cpu
diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __ALL_EQUAL_CPU_H__
+#define __ALL_EQUAL_CPU_H__
+
+#include "../all_equal.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __ALL_EQUAL_CPU_H__
diff --git a/src/infiniop/ops/all_equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh
@@ -0,0 +1,54 @@
+#ifndef __ALL_EQUAL_KERNEL_CUH__
+#define __ALL_EQUAL_KERNEL_CUH__
+//  ------------------------------- start: perform operator on CUDA --------------------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+__device__ void allEqualKernel(
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    size_t ndim,
+    size_t total_size,
+    ptrdiff_t *contiguous_strides,
+    ptrdiff_t *a_strides,
+    ptrdiff_t *b_strides) {
+    // 使用共享内存来避免竞态条件
+    __shared__ bool block_result;
+
+    if (threadIdx.x == 0) {
+        block_result = true;
+    }
+    __syncthreads();
+
+    // 每个线程检查自己负责的元素
+    bool thread_result = true;
+    for (size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) {
+        auto a_ptr = a;
+        auto b_ptr = b;
+        size_t rem = i;
+        for (int d = ndim - 1; d >= 0; d--) {
+            size_t dim_index = rem / contiguous_strides[d];
+            rem = rem % contiguous_strides[d];
+            a_ptr += dim_index * a_strides[d];
+            b_ptr += dim_index * b_strides[d];
+        }
+        if (*a_ptr != *b_ptr) {
+            thread_result = false;
+            break; // 发现不匹配，提前退出
+        }
+    }
+
+    // 使用原子操作来安全地更新结果
+    if (!thread_result) {
+        atomicAnd((int *)&block_result, 0);
+    }
+
+    __syncthreads();
+
+    // 只有第一个线程写入最终结果
+    if (threadIdx.x == 0) {
+        *c = block_result;
+    }
+}
+//  -------------------------------- end: perform operator on CUDA ---------------------------------
+
+#endif // __ALL_EQUAL_KERNEL_CUH__