Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@
#include "infiniop/ops/swiglu.h"
#include "infiniop/ops/topkrouter.h"
#include "infiniop/tensor_descriptor.h"
#include "infiniop/ops/all_equal.h"

#endif // __INFINIOP_API_H__
#endif // __INFINIOP_API_H__
30 changes: 30 additions & 0 deletions include/infiniop/ops/all_equal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#ifndef __INFINIOP_ALL_EQUAL_API_H__
#define __INFINIOP_ALL_EQUAL_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopAllEqualDescriptor_t;

__C __export infiniStatus_t infiniopCreateAllEqualDescriptor(
infiniopHandle_t handle,
infiniopAllEqualDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc
);

__C __export infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopAllEqual(
infiniopAllEqualDescriptor_t desc,
void *workspace,
size_t workspace_size,
void * c,
const void * a,
const void * b,
void *stream
);

__C __export infiniStatus_t infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc);

#endif
2 changes: 2 additions & 0 deletions src/infiniop-test/include/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ DECLARE_INFINIOP_TEST(add)
DECLARE_INFINIOP_TEST(causal_softmax)
DECLARE_INFINIOP_TEST(rearrange)
DECLARE_INFINIOP_TEST(sub)
DECLARE_INFINIOP_TEST(all_equal)

#define REGISTER_INFINIOP_TEST(name) \
{ \
Expand Down Expand Up @@ -43,6 +44,7 @@ DECLARE_INFINIOP_TEST(sub)
REGISTER_INFINIOP_TEST(causal_softmax) \
REGISTER_INFINIOP_TEST(rearrange) \
REGISTER_INFINIOP_TEST(sub) \
REGISTER_INFINIOP_TEST(all_equal) \
}

namespace infiniop_test {
Expand Down
110 changes: 110 additions & 0 deletions src/infiniop-test/src/ops/all_equal.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include "../../../include/infiniop/ops/all_equal.h"
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>

namespace infiniop_test::all_equal {
struct Test::Attributes {
std::shared_ptr<Tensor> a;
std::shared_ptr<Tensor> b;
std::shared_ptr<Tensor> c;
std::shared_ptr<Tensor> ans;
};

std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}

test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];

return test;
}

std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopAllEqualDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto b = _attributes->b->to(device, device_id);
auto c = _attributes->c->to(device, device_id);
CHECK_OR(infiniopCreateAllEqualDescriptor(handle, &op_desc,
c->desc(),
a->desc(),
b->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetAllEqualWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopAllEqual(op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

try {
allClose(c, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}

double elapsed_time = 0.;

elapsed_time = benchmark(
[=]() {
infiniopAllEqual(
op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
},
warm_ups, iterations);

return TEST_PASSED(elapsed_time);
}

std::vector<std::string> Test::attribute_names() {
return {};
}

std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}

std::vector<std::string> Test::output_names() {
return {"c"};
}

std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- b: " << _attributes->b->info() << std::endl;
oss << "- c: " << _attributes->c->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}

Test::~Test() {
delete _attributes;
}

} // namespace infiniop_test::all_equal
46 changes: 46 additions & 0 deletions src/infiniop/ops/all_equal/all_equal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#ifndef __ALL_EQUAL_H__
#define __ALL_EQUAL_H__

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"

#define DESCRIPTOR(NAMESPACE) \
namespace op::all_equal::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
op::all_equal::AllEqualInfo _info; \
size_t _workspace_size; \
Descriptor( \
infiniDtype_t dtype, \
op::all_equal::AllEqualInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) : InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t c_desc, \
infiniopTensorDescriptor_t a_desc, \
infiniopTensorDescriptor_t b_desc); \
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *c, \
const void *a, \
const void *b, \
void *stream) const; \
}; \
}

#endif
76 changes: 76 additions & 0 deletions src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "all_equal_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include "../info.h"

namespace op::all_equal::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);

// --------------------- start: check data type and calculate workspace size ----------------------
auto dtype = c_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL);
CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
size_t WorkSpaceSize = 0;
// ---------------------- end: check data type and calculate workspace size -----------------------

auto result = AllEqualInfo::createAllEqualInfo(
c_desc,
a_desc,
b_desc);
CHECK_RESULT(result);
const AllEqualInfo &info = result.take();

*desc_ptr = new Descriptor(
dtype, std::move(info), WorkSpaceSize,
nullptr,
handle->device, handle->device_id);

return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream) const {
std::vector<ptrdiff_t> contiguous_strides(_info.ndim);
ptrdiff_t last_dim = 1;
ptrdiff_t last_stride = 1;
for (size_t d = 0; d < _info.ndim; d++) {
contiguous_strides[d] = last_dim * last_stride;
last_dim = _info.a_shape[d];
last_stride = contiguous_strides[d];
}
size_t total_size = last_dim * last_stride;
size_t elem_size = infiniSizeOf(_info.dtype);
auto c_ptr = reinterpret_cast<bool *>(c);
*c_ptr = true;
#pragma omp parallel for
for (int i = 0; i < static_cast<int>(total_size); i++) {
auto a_ptr = reinterpret_cast<const char *>(a);
auto b_ptr = reinterpret_cast<const char *>(b);
size_t rem = static_cast<size_t>(i);
for (int d = static_cast<int>(_info.ndim) - 1; d >= 0; d--) {
size_t dim_index = rem / contiguous_strides[d];
rem = rem % contiguous_strides[d];
a_ptr += dim_index * _info.a_strides[d];
b_ptr += dim_index * _info.b_strides[d];
}
if (memcmp(a_ptr, b_ptr, elem_size) != 0) {
*c_ptr = false;
}
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all_equal::cpu
8 changes: 8 additions & 0 deletions src/infiniop/ops/all_equal/cpu/all_equal_cpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __ALL_EQUAL_CPU_H__
#define __ALL_EQUAL_CPU_H__

#include "../all_equal.h"

DESCRIPTOR(cpu)

#endif // __ALL_EQUAL_CPU_H__
54 changes: 54 additions & 0 deletions src/infiniop/ops/all_equal/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef __ALL_EQUAL_KERNEL_CUH__
#define __ALL_EQUAL_KERNEL_CUH__
// ------------------------------- start: perform operator on CUDA --------------------------------
template <unsigned int BLOCK_SIZE, typename Tdata>
__device__ void allEqualKernel(
bool *c,
const Tdata *a,
const Tdata *b,
size_t ndim,
size_t total_size,
ptrdiff_t *contiguous_strides,
ptrdiff_t *a_strides,
ptrdiff_t *b_strides) {
// 使用共享内存来避免竞态条件
__shared__ bool block_result;

if (threadIdx.x == 0) {
block_result = true;
}
__syncthreads();

// 每个线程检查自己负责的元素
bool thread_result = true;
for (size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) {
auto a_ptr = a;
auto b_ptr = b;
size_t rem = i;
for (int d = ndim - 1; d >= 0; d--) {
size_t dim_index = rem / contiguous_strides[d];
rem = rem % contiguous_strides[d];
a_ptr += dim_index * a_strides[d];
b_ptr += dim_index * b_strides[d];
}
if (*a_ptr != *b_ptr) {
thread_result = false;
break; // 发现不匹配,提前退出
}
}

// 使用原子操作来安全地更新结果
if (!thread_result) {
atomicAnd((int *)&block_result, 0);
}

__syncthreads();

// 只有第一个线程写入最终结果
if (threadIdx.x == 0) {
*c = block_result;
}
}
// -------------------------------- end: perform operator on CUDA ---------------------------------

#endif // __ALL_EQUAL_KERNEL_CUH__
Loading