diff --git a/include/infiniop.h b/include/infiniop.h
index b3cf8b6ca..3710fe1e5 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -20,5 +20,6 @@
+#include "infiniop/ops/all_equal.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/tensor_descriptor.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/all_equal.h b/include/infiniop/ops/all_equal.h
new file mode 100644
index 000000000..e22873bde
--- /dev/null
+++ b/include/infiniop/ops/all_equal.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_ALL_EQUAL_API_H__
+#define __INFINIOP_ALL_EQUAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAllEqualDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAllEqualDescriptor(
+    infiniopHandle_t handle,
+    infiniopAllEqualDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc);
+
+__C __export infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAllEqual(
+    infiniopAllEqualDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc);
+
+#endif // __INFINIOP_ALL_EQUAL_API_H__
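Reviewer note: a minimal host-side sketch of the intended call sequence for the new C API. The four `infiniopAllEqual*` functions come from the header above; the handle, tensor descriptors, and device buffers are assumed to be set up with the existing infiniop/infinirt helpers (only `infinirtMalloc` is shown, since it is already used elsewhere in this diff), so treat the surrounding scaffolding as an assumption rather than part of this change.

```cpp
#include "infiniop.h"
#include "infinirt.h"
#include <cassert>

// c is a 1-element INFINI_DTYPE_BOOL tensor; a and b share one shape/dtype.
void all_equal_example(infiniopHandle_t handle,
                       infiniopTensorDescriptor_t c_desc,
                       infiniopTensorDescriptor_t a_desc,
                       infiniopTensorDescriptor_t b_desc,
                       void *c, const void *a, const void *b, void *stream) {
    infiniopAllEqualDescriptor_t desc;
    assert(infiniopCreateAllEqualDescriptor(handle, &desc, c_desc, a_desc, b_desc)
           == INFINI_STATUS_SUCCESS);

    size_t workspace_size = 0;
    assert(infiniopGetAllEqualWorkspaceSize(desc, &workspace_size)
           == INFINI_STATUS_SUCCESS);

    void *workspace = nullptr;
    assert(infinirtMalloc(&workspace, workspace_size) == INFINI_STATUS_SUCCESS);

    // Writes a single bool into c: true iff a and b match element-wise.
    assert(infiniopAllEqual(desc, workspace, workspace_size, c, a, b, stream)
           == INFINI_STATUS_SUCCESS);

    assert(infiniopDestroyAllEqualDescriptor(desc) == INFINI_STATUS_SUCCESS);
}
```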
diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
index 3820f7cfd..ad10d2d73 100644
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -16,6 +16,7 @@ DECLARE_INFINIOP_TEST(add)
 DECLARE_INFINIOP_TEST(causal_softmax)
 DECLARE_INFINIOP_TEST(rearrange)
 DECLARE_INFINIOP_TEST(sub)
+DECLARE_INFINIOP_TEST(all_equal)
 
 #define REGISTER_INFINIOP_TEST(name) \
     { \
@@ -43,6 +44,7 @@ DECLARE_INFINIOP_TEST(sub)
     REGISTER_INFINIOP_TEST(causal_softmax) \
     REGISTER_INFINIOP_TEST(rearrange) \
     REGISTER_INFINIOP_TEST(sub) \
+    REGISTER_INFINIOP_TEST(all_equal) \
 }
 
 namespace infiniop_test {
diff --git a/src/infiniop-test/src/ops/all_equal.cpp b/src/infiniop-test/src/ops/all_equal.cpp
new file mode 100644
index 000000000..34b0be012
--- /dev/null
+++ b/src/infiniop-test/src/ops/all_equal.cpp
@@ -0,0 +1,110 @@
+#include "../../../include/infiniop/ops/all_equal.h"
+#include "ops.hpp"
+#include "utils.hpp"
+#include <iomanip>
+#include <sstream>
+#include <stdexcept>
+
+namespace infiniop_test::all_equal {
+struct Test::Attributes {
+    std::shared_ptr<Tensor> a;
+    std::shared_ptr<Tensor> b;
+    std::shared_ptr<Tensor> c;
+    std::shared_ptr<Tensor> ans;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+    if (tensors.find("a") == tensors.end()
+        || tensors.find("b") == tensors.end()
+        || tensors.find("c") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Test");
+    }
+
+    test->_attributes->a = tensors["a"];
+    test->_attributes->b = tensors["b"];
+    test->_attributes->c = tensors["c"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    infiniopAllEqualDescriptor_t op_desc;
+    auto a = _attributes->a->to(device, device_id);
+    auto b = _attributes->b->to(device, device_id);
+    auto c = _attributes->c->to(device, device_id);
+    CHECK_OR(infiniopCreateAllEqualDescriptor(handle, &op_desc,
+                                              c->desc(),
+                                              a->desc(),
+                                              b->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetAllEqualWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopAllEqual(op_desc, workspace, workspace_size,
+                              c->data(),
+                              a->data(),
+                              b->data(),
+                              nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        allClose(c, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopAllEqual(
+                op_desc, workspace, workspace_size,
+                c->data(),
+                a->data(),
+                b->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"a", "b", "c", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"c"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- a: " << _attributes->a->info() << std::endl;
+    oss << "- b: " << _attributes->b->info() << std::endl;
+    oss << "- c: " << _attributes->c->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::all_equal
diff --git a/src/infiniop/ops/all_equal/all_equal.h b/src/infiniop/ops/all_equal/all_equal.h
new file mode 100644
index 000000000..3eb453cf1
--- /dev/null
+++ b/src/infiniop/ops/all_equal/all_equal.h
@@ -0,0 +1,46 @@
+#ifndef __ALL_EQUAL_H__
+#define __ALL_EQUAL_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                                \
+    namespace op::all_equal::NAMESPACE {                                     \
+    class Descriptor final : public InfiniopDescriptor {                     \
+        struct Opaque;                                                       \
+        Opaque *_opaque;                                                     \
+        op::all_equal::AllEqualInfo _info;                                   \
+        size_t _workspace_size;                                              \
+        Descriptor(                                                          \
+            infiniDtype_t dtype,                                             \
+            op::all_equal::AllEqualInfo info,                                \
+            size_t workspace_size_,                                          \
+            Opaque *opaque,                                                  \
+            infiniDevice_t device_type,                                      \
+            int device_id) : InfiniopDescriptor{device_type, device_id},     \
+                             _opaque(opaque),                                \
+                             _info(info),                                    \
+                             _workspace_size(workspace_size_) {}             \
+                                                                             \
+    public:                                                                  \
+        ~Descriptor();                                                       \
+        size_t workspaceSize() const { return _workspace_size; }             \
+        static infiniStatus_t create(                                        \
+            infiniopHandle_t handle,                                         \
+            Descriptor **desc_ptr,                                           \
+            infiniopTensorDescriptor_t c_desc,                               \
+            infiniopTensorDescriptor_t a_desc,                               \
+            infiniopTensorDescriptor_t b_desc);                              \
+        infiniStatus_t calculate(                                            \
+            void *workspace,                                                 \
+            size_t workspace_size,                                           \
+            void *c,                                                         \
+            const void *a,                                                   \
+            const void *b,                                                   \
+            void *stream) const;                                             \
+    };                                                                       \
+    }
+
+#endif // __ALL_EQUAL_H__
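Reviewer note: for readability, `DESCRIPTOR(cpu)` above expands (modulo whitespace) to the class skeleton below; each backend header simply stamps this out in its own namespace.

```cpp
namespace op::all_equal::cpu {
class Descriptor final : public InfiniopDescriptor {
    struct Opaque;
    Opaque *_opaque;
    op::all_equal::AllEqualInfo _info;
    size_t _workspace_size;

    Descriptor(infiniDtype_t dtype, op::all_equal::AllEqualInfo info,
               size_t workspace_size_, Opaque *opaque,
               infiniDevice_t device_type, int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _opaque(opaque), _info(info), _workspace_size(workspace_size_) {}

public:
    ~Descriptor();
    size_t workspaceSize() const { return _workspace_size; }
    static infiniStatus_t create(
        infiniopHandle_t handle, Descriptor **desc_ptr,
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t a_desc,
        infiniopTensorDescriptor_t b_desc);
    infiniStatus_t calculate(
        void *workspace, size_t workspace_size,
        void *c, const void *a, const void *b, void *stream) const;
};
} // namespace op::all_equal::cpu
```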
+#include "../info.h" + +namespace op::all_equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + auto handle = reinterpret_cast(handle_); + + // --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = c_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL); + CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t WorkSpaceSize = 0; + // ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = AllEqualInfo::createAllEqualInfo( + c_desc, + a_desc, + b_desc); + CHECK_RESULT(result); + const AllEqualInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) const { + std::vector contiguous_strides(_info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for (size_t d = 0; d < _info.ndim; d++) { + contiguous_strides[d] = last_dim * last_stride; + last_dim = _info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + size_t elem_size = infiniSizeOf(_info.dtype); + auto c_ptr = reinterpret_cast(c); + *c_ptr = true; +#pragma omp parallel for + for (int i = 0; i < static_cast(total_size); i++) { + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + size_t rem = static_cast(i); + for (int d = static_cast(_info.ndim) - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + a_ptr += dim_index * _info.a_strides[d]; + b_ptr += dim_index * _info.b_strides[d]; + } + if (memcmp(a_ptr, b_ptr, elem_size) != 0) { + *c_ptr = false; + } + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::all_equal::cpu diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h new file mode 100644 index 000000000..5cc5f2fdd --- /dev/null +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h @@ -0,0 +1,8 @@ +#ifndef __ALL_EQUAL_CPU_H__ +#define __ALL_EQUAL_CPU_H__ + +#include "../all_equal.h" + +DESCRIPTOR(cpu) + +#endif // __ALL_EQUAL_CPU_H__ diff --git a/src/infiniop/ops/all_equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh new file mode 100644 index 000000000..048d33a81 --- /dev/null +++ b/src/infiniop/ops/all_equal/cuda/kernel.cuh @@ -0,0 +1,54 @@ +#ifndef __ALL_EQUAL_KERNEL_CUH__ +#define __ALL_EQUAL_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void allEqualKernel( + bool *c, + const Tdata *a, + const Tdata *b, + size_t ndim, + size_t total_size, + ptrdiff_t *contiguous_strides, + ptrdiff_t *a_strides, + ptrdiff_t *b_strides) { + // 使用共享内存来避免竞态条件 + __shared__ bool block_result; + + if (threadIdx.x == 0) { + block_result = true; + } + __syncthreads(); + + // 每个线程检查自己负责的元素 + bool thread_result = true; + for (size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) { + auto a_ptr = a; + auto b_ptr = b; + size_t rem = i; + for (int d = ndim - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem 
diff --git a/src/infiniop/ops/all_equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh
new file mode 100644
index 000000000..048d33a81
--- /dev/null
+++ b/src/infiniop/ops/all_equal/cuda/kernel.cuh
@@ -0,0 +1,55 @@
+#ifndef __ALL_EQUAL_KERNEL_CUH__
+#define __ALL_EQUAL_KERNEL_CUH__
+// ------------------------------- start: perform operator on CUDA --------------------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+__device__ void allEqualKernel(
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    size_t ndim,
+    size_t total_size,
+    ptrdiff_t *contiguous_strides,
+    ptrdiff_t *a_strides,
+    ptrdiff_t *b_strides) {
+    // Block-wide result flag in shared memory, initialized by thread 0.
+    __shared__ bool block_result;
+
+    if (threadIdx.x == 0) {
+        block_result = true;
+    }
+    __syncthreads();
+
+    // Each thread checks its own strided subset of the elements.
+    bool thread_result = true;
+    for (size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) {
+        auto a_ptr = a;
+        auto b_ptr = b;
+        size_t rem = i;
+        for (int d = static_cast<int>(ndim) - 1; d >= 0; d--) {
+            size_t dim_index = rem / contiguous_strides[d];
+            rem = rem % contiguous_strides[d];
+            a_ptr += dim_index * a_strides[d];
+            b_ptr += dim_index * b_strides[d];
+        }
+        if (*a_ptr != *b_ptr) {
+            thread_result = false;
+            break; // mismatch found, stop early
+        }
+    }
+
+    // A plain store is sufficient: every writer stores the same value, and the
+    // barrier below orders these stores before thread 0 reads the flag.
+    if (!thread_result) {
+        block_result = false;
+    }
+
+    __syncthreads();
+
+    // Only the first thread writes the final result.
+    if (threadIdx.x == 0) {
+        *c = block_result;
+    }
+}
+// -------------------------------- end: perform operator on CUDA ---------------------------------
+
+#endif // __ALL_EQUAL_KERNEL_CUH__
diff --git a/src/infiniop/ops/all_equal/info.h b/src/infiniop/ops/all_equal/info.h
new file mode 100644
index 000000000..71cbb1bb6
--- /dev/null
+++ b/src/infiniop/ops/all_equal/info.h
@@ -0,0 +1,44 @@
+#ifndef __ALL_EQUAL_INFO_H__
+#define __ALL_EQUAL_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+
+namespace op::all_equal {
+
+class AllEqualInfo {
+private:
+    AllEqualInfo() = default;
+
+public:
+    // ---------------------------- start: define member variables of Info ----------------------------
+    size_t ndim;
+    infiniDtype_t dtype;
+    std::vector<size_t> a_shape;
+    std::vector<ptrdiff_t> a_strides;
+    std::vector<ptrdiff_t> b_strides;
+    // ----------------------------- end: define member variables of Info -----------------------------
+
+    static utils::Result<AllEqualInfo> createAllEqualInfo(
+        infiniopTensorDescriptor_t c_desc,
+        infiniopTensorDescriptor_t a_desc,
+        infiniopTensorDescriptor_t b_desc) {
+        // ------------------------- start: check tensor shape and input validity -------------------------
+        CHECK_OR_RETURN(c_desc->ndim() == 1 && c_desc->dim(0) == 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        CHECK_SAME_SHAPE(a_desc->shape(), b_desc->shape());
+        // -------------------------- end: check tensor shape and input validity --------------------------
+        return utils::Result<AllEqualInfo>(AllEqualInfo{
+            // ------------------------------ start: create an instance of Info -------------------------------
+            a_desc->ndim(),
+            a_desc->dtype(),
+            a_desc->shape(),
+            a_desc->strides(),
+            b_desc->strides()
+            // ------------------------------- end: create an instance of Info --------------------------------
+        });
+    }
+};
+} // namespace op::all_equal
+
+#endif // __ALL_EQUAL_INFO_H__
diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.h b/src/infiniop/ops/all_equal/metax/all_equal_metax.h
new file mode 100644
index 000000000..fbb9ef70a
--- /dev/null
+++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.h
@@ -0,0 +1,8 @@
+#ifndef __ALL_EQUAL_METAX_H__
+#define __ALL_EQUAL_METAX_H__
+
+#include "../all_equal.h"
+
+DESCRIPTOR(metax)
+
+#endif // __ALL_EQUAL_METAX_H__
diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.maca b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca
new file mode 100644
index 000000000..df50ffbee
--- /dev/null
+++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca
@@ -0,0 +1,153 @@
+#include "../../../devices/metax/metax_common.h"
+#include "all_equal_metax.h"
+#include <memory>
+#include "../../../devices/metax/metax_kernel_common.h"
+#include "../../../reduce/cuda/reduce.cuh"
+#include "../cuda/kernel.cuh"
+#include "../info.h"
+
+namespace op::all_equal::metax {
+
+template <unsigned int BLOCK_SIZE, typename Tdata>
+INFINIOP_METAX_KERNEL launchKernel(
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    size_t ndim,
+    size_t total_size,
+    ptrdiff_t *contiguous_strides,
+    ptrdiff_t *a_strides,
+    ptrdiff_t *b_strides) {
+    allEqualKernel<BLOCK_SIZE, Tdata>(
+        c,
+        a,
+        b,
+        ndim,
+        total_size,
+        contiguous_strides,
+        a_strides,
+        b_strides);
+}
+
+// ----------------------------------- start: call launchKernel -----------------------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+infiniStatus_t calculate_all_equal(
+    const AllEqualInfo &info,
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    hcStream_t stream,
+    void *workspace) {
+    size_t ndim = info.ndim;
+    std::vector<ptrdiff_t> contiguous_strides(ndim);
+    size_t last_dim = 1, last_stride = 1;
+    for (size_t d = 0; d < ndim; d++) {
+        contiguous_strides[d] = last_dim * last_stride;
+        last_dim = info.a_shape[d];
+        last_stride = contiguous_strides[d];
+    }
+    size_t total_size = last_dim * last_stride;
+
+    ptrdiff_t *contiguous_strides_dev = reinterpret_cast<ptrdiff_t *>(workspace);
+    ptrdiff_t *a_strides_dev = contiguous_strides_dev + ndim;
+    ptrdiff_t *b_strides_dev = a_strides_dev + ndim;
+
+    // contiguous_strides is stack-local, so copy it synchronously (assuming
+    // hcMemcpy mirrors cudaMemcpy, as the other hc* calls here mirror cuda*);
+    // the stride vectors owned by info outlive this call and may go async.
+    CHECK_METAX(hcMemcpy(contiguous_strides_dev, contiguous_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice));
+    CHECK_METAX(hcMemcpyAsync(a_strides_dev, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream));
+    CHECK_METAX(hcMemcpyAsync(b_strides_dev, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream));
+
+    launchKernel<BLOCK_SIZE, Tdata><<<1, BLOCK_SIZE, 0, stream>>>(
+        c,
+        a,
+        b,
+        ndim,
+        total_size,
+        contiguous_strides_dev,
+        a_strides_dev,
+        b_strides_dev);
+    return INFINI_STATUS_SUCCESS;
+}
+// ------------------------------------ end: call launchKernel ------------------------------------
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::metax::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc) {
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    // --------------------- start: check data type and calculate workspace size ----------------------
+    auto dtype = a_desc->dtype();
+    CHECK_DTYPE(c_desc->dtype(), INFINI_DTYPE_BOOL);
+    CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
+    auto result = AllEqualInfo::createAllEqualInfo(
+        c_desc,
+        a_desc,
+        b_desc);
+    CHECK_RESULT(result);
+    const AllEqualInfo &info = result.take();
+    size_t workspace_size = sizeof(ptrdiff_t) * info.ndim * 3;
+    // ---------------------- end: check data type and calculate workspace size -----------------------
+
+    *desc_ptr = new Descriptor(
+        dtype, std::move(info), workspace_size,
+        new Opaque{handle->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream_) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    hcStream_t stream = (hcStream_t)stream_;
+
+#define CALCULATE_EQUAL(TDATA) \
+    calculate_all_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
+    switch (_info.dtype) {
+    case INFINI_DTYPE_U8:
+        return CALCULATE_EQUAL(uint8_t);
+    case INFINI_DTYPE_U16:
+        return CALCULATE_EQUAL(uint16_t);
+    case INFINI_DTYPE_U32:
+        return CALCULATE_EQUAL(uint32_t);
+    case INFINI_DTYPE_U64:
+        return CALCULATE_EQUAL(uint64_t);
+    case INFINI_DTYPE_I8:
+        return CALCULATE_EQUAL(int8_t);
+    case INFINI_DTYPE_I16:
+        return CALCULATE_EQUAL(int16_t);
+    case INFINI_DTYPE_I32:
+        return CALCULATE_EQUAL(int32_t);
+    case INFINI_DTYPE_I64:
+        return CALCULATE_EQUAL(int64_t);
+    case INFINI_DTYPE_F16:
+        return CALCULATE_EQUAL(half);
+    case INFINI_DTYPE_F32:
+        return CALCULATE_EQUAL(float);
+    case INFINI_DTYPE_BF16:
+        return CALCULATE_EQUAL(cuda_bfloat16);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+#undef CALCULATE_EQUAL
+}
+} // namespace op::all_equal::metax
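Reviewer note: both GPU backends carve the single workspace allocation (sized `3 * ndim * sizeof(ptrdiff_t)` in `create`) into three stride tables. A plain C++ sketch of that layout, with a host buffer standing in for device memory and hypothetical helper names:

```cpp
#include <cstddef>
#include <cstdlib>

// The workspace returned by infiniopGetAllEqualWorkspaceSize holds
// 3 * ndim ptrdiff_t values: one table per stride array.
struct StrideTables {
    ptrdiff_t *contiguous; // entries [0, ndim)
    ptrdiff_t *a;          // entries [ndim, 2*ndim)
    ptrdiff_t *b;          // entries [2*ndim, 3*ndim)
};

StrideTables carve_workspace(void *workspace, size_t ndim) {
    auto base = reinterpret_cast<ptrdiff_t *>(workspace);
    return StrideTables{base, base + ndim, base + 2 * ndim};
}

int main() {
    size_t ndim = 3;
    void *ws = std::malloc(3 * ndim * sizeof(ptrdiff_t)); // stand-in for device memory
    StrideTables t = carve_workspace(ws, ndim);
    t.contiguous[0] = 1; t.a[0] = 16; t.b[0] = 16; // and so on per dimension
    std::free(ws);
}
```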
diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu
new file mode 100644
index 000000000..4d8331e80
--- /dev/null
+++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu
@@ -0,0 +1,153 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "../../../devices/nvidia/nvidia_handle.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../cuda/kernel.cuh"
+#include "../info.h"
+#include "all_equal_nvidia.cuh"
+
+namespace op::all_equal::nvidia {
+
+// ---------------------- start: launchKernel: call kernel function of CUDA -----------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+INFINIOP_CUDA_KERNEL launchKernel(
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    size_t ndim,
+    size_t total_size,
+    ptrdiff_t *contiguous_strides,
+    ptrdiff_t *a_strides,
+    ptrdiff_t *b_strides) {
+    allEqualKernel<BLOCK_SIZE, Tdata>(
+        c,
+        a,
+        b,
+        ndim,
+        total_size,
+        contiguous_strides,
+        a_strides,
+        b_strides);
+}
+// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------
+
+// ----------------------------------- start: call launchKernel -----------------------------------
+template <unsigned int BLOCK_SIZE, typename Tdata>
+infiniStatus_t calculate_all_equal(
+    const AllEqualInfo &info,
+    bool *c,
+    const Tdata *a,
+    const Tdata *b,
+    cudaStream_t stream,
+    void *workspace) {
+    size_t ndim = info.ndim;
+    std::vector<ptrdiff_t> contiguous_strides(ndim);
+    size_t last_dim = 1, last_stride = 1;
+    for (size_t d = 0; d < ndim; d++) {
+        contiguous_strides[d] = last_dim * last_stride;
+        last_dim = info.a_shape[d];
+        last_stride = contiguous_strides[d];
+    }
+    size_t total_size = last_dim * last_stride;
+
+    ptrdiff_t *contiguous_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace);
+    ptrdiff_t *a_strides_cuda = contiguous_strides_cuda + ndim;
+    ptrdiff_t *b_strides_cuda = a_strides_cuda + ndim;
+
+    // contiguous_strides is stack-local, so copy it synchronously; the stride
+    // vectors owned by info outlive this call and may be copied asynchronously.
+    CHECK_CUDA(cudaMemcpy(contiguous_strides_cuda, contiguous_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream));
+    CHECK_CUDA(cudaMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream));
+
+    launchKernel<BLOCK_SIZE, Tdata><<<1, BLOCK_SIZE, 0, stream>>>(
+        c,
+        a,
+        b,
+        ndim,
+        total_size,
+        contiguous_strides_cuda,
+        a_strides_cuda,
+        b_strides_cuda);
+
+    return INFINI_STATUS_SUCCESS;
+}
+// ------------------------------------ end: call launchKernel ------------------------------------
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc) {
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    // --------------------- start: check data type and calculate workspace size ----------------------
+    auto dtype = a_desc->dtype();
+    CHECK_DTYPE(c_desc->dtype(), INFINI_DTYPE_BOOL);
+    CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
+    auto result = AllEqualInfo::createAllEqualInfo(
+        c_desc,
+        a_desc,
+        b_desc);
+    CHECK_RESULT(result);
+    const AllEqualInfo &info = result.take();
+    size_t workspace_size = sizeof(ptrdiff_t) * info.ndim * 3;
+    // ---------------------- end: check data type and calculate workspace size -----------------------
+    *desc_ptr = new Descriptor(
+        dtype, std::move(info), workspace_size,
+        new Opaque{handle->internal()},
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream_) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+    cudaStream_t stream = (cudaStream_t)stream_;
+
+#define CALCULATE_EQUAL(TDATA) \
+    calculate_all_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace)
+    switch (_info.dtype) {
+    case INFINI_DTYPE_U8:
+        return CALCULATE_EQUAL(uint8_t);
+    case INFINI_DTYPE_U16:
+        return CALCULATE_EQUAL(uint16_t);
+    case INFINI_DTYPE_U32:
+        return CALCULATE_EQUAL(uint32_t);
+    case INFINI_DTYPE_U64:
+        return CALCULATE_EQUAL(uint64_t);
+    case INFINI_DTYPE_I8:
+        return CALCULATE_EQUAL(int8_t);
+    case INFINI_DTYPE_I16:
+        return CALCULATE_EQUAL(int16_t);
+    case INFINI_DTYPE_I32:
+        return CALCULATE_EQUAL(int32_t);
+    case INFINI_DTYPE_I64:
+        return CALCULATE_EQUAL(int64_t);
+    case INFINI_DTYPE_F16:
+        return CALCULATE_EQUAL(half);
+    case INFINI_DTYPE_F32:
+        return CALCULATE_EQUAL(float);
+    case INFINI_DTYPE_BF16:
+        return CALCULATE_EQUAL(cuda_bfloat16);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+#undef CALCULATE_EQUAL
+}
+} // namespace op::all_equal::nvidia
diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh
new file mode 100644
index 000000000..c459e8def
--- /dev/null
+++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh
@@ -0,0 +1,7 @@
+#ifndef __ALL_EQUAL_NVIDIA_API_H__
+#define __ALL_EQUAL_NVIDIA_API_H__
+#include "../all_equal.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __ALL_EQUAL_NVIDIA_API_H__
diff --git a/src/infiniop/ops/all_equal/operator.cc b/src/infiniop/ops/all_equal/operator.cc
new file mode 100644
index 000000000..f89e565a7
--- /dev/null
+++ b/src/infiniop/ops/all_equal/operator.cc
@@ -0,0 +1,149 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/all_equal.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/all_equal_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/all_equal_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/all_equal_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateAllEqualDescriptor(
+    infiniopHandle_t handle,
+    infiniopAllEqualDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        return op::all_equal::NAMESPACE::Descriptor::create(                      \
+            handle,                                                               \
+            reinterpret_cast<op::all_equal::NAMESPACE::Descriptor **>(desc_ptr), \
+            c_desc,                                                               \
+            a_desc,                                                               \
+            b_desc)
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                      \
+    case CASE:                                                                                    \
+        *size = reinterpret_cast<op::all_equal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
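Reviewer note: the dispatch macros in `operator.cc` expand mechanically; e.g. `CALCULATE(INFINI_DEVICE_CPU, cpu)` inside the switch in `infiniopAllEqual` becomes:

```cpp
case INFINI_DEVICE_CPU:
    return reinterpret_cast<op::all_equal::cpu::Descriptor *>(desc)->calculate(
        workspace, workspace_size, c, a, b, stream);
```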
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef GET
+}
+
+__C infiniStatus_t infiniopAllEqual(
+    infiniopAllEqualDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                         \
+    case CASE:                                                                             \
+        return reinterpret_cast<op::all_equal::NAMESPACE::Descriptor *>(desc)->calculate( \
+            workspace,                                                                     \
+            workspace_size,                                                                \
+            c,                                                                             \
+            a,                                                                             \
+            b,                                                                             \
+            stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                  \
+        delete reinterpret_cast<op::all_equal::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
diff --git a/test/infiniop/all_equal.py b/test/infiniop/all_equal.py
new file mode 100644
index 000000000..adcf3d366
--- /dev/null
+++ b/test/infiniop/all_equal.py
@@ -0,0 +1,220 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride
+    ((13, 4), None, None),
+    ((13, 4), (13, 1), (13, 1)),
+    ((13, 4, 4), (16, 4, 1), (16, 4, 1)),
+    ((16, 5632), None, None),
+]
+
+
+class Identical(Enum):
+    EQUAL = auto()
+    NOT_EQUAL = auto()
+
+
+_IDENTICAL = [
+    Identical.EQUAL,  # -> result=true
+    Identical.NOT_EQUAL,  # -> result=false
+]
+
+_TEST_CASES = [
+    test_case + (identical_item,)
+    for test_case in _TEST_CASES_
+    for identical_item in _IDENTICAL
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64]
+
+# Tolerance map for different data types; all_equal is exact, so zero tolerance
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 0, "rtol": 0},
+    InfiniDtype.F32: {"atol": 0, "rtol": 0},
+    InfiniDtype.BF16: {"atol": 0, "rtol": 0},
+    InfiniDtype.I32: {"atol": 0, "rtol": 0},
+    InfiniDtype.I64: {"atol": 0, "rtol": 0},
+}
+
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def torch_equal(c, a, b):
+    return torch.tensor(torch.equal(input=a, other=b), dtype=torch.bool)
+
+
+def test(
+    handle,
+    device,
+    input_shape,
+    a_strides,
+    b_strides,
+    identical,
+    dtype,
+    sync=None,
+):
+    torch_dtype = {
+        InfiniDtype.F16: torch.half,
+        InfiniDtype.F32: torch.float,
+        InfiniDtype.BF16: torch.bfloat16,
+        InfiniDtype.I32: torch.int32,
+        InfiniDtype.I64: torch.int64,
+    }[dtype]
+    is_integer_dtype = torch_dtype in (torch.int32, torch.int64)
+
+    print(
+        f"Testing all_equal on {InfiniDeviceNames[device]} with input_shape:{input_shape},"
+        f" a_stride:{a_strides} b_stride:{b_strides} identical:{identical},"
+        f" dtype:{InfiniDtypeNames[dtype]}"
+    )
+    torch_c = torch.tensor([False], dtype=torch.bool)
+    c = TestTensor(
+        [1],
+        torch_c.stride(),
+        InfiniDtype.BOOL,
+        device,
+        "manual",
+        set_tensor=torch_c,
+    )
+
+    if a_strides is None:
+        torch_a = (torch.rand(input_shape) * 100 - 50).type(torch_dtype)
+    else:
+        # Allocate storage that can support the requested strides
+        torch_a = torch.empty_strided(input_shape, a_strides, dtype=torch_dtype)
+        if is_integer_dtype:
+            tmp_a = torch.randint(-50, 50, input_shape, dtype=torch_dtype)
+            torch_a.copy_(tmp_a)
+        else:
+            torch_a.uniform_(-50, 50)
+    a = TestTensor(
+        input_shape,
+        torch_a.stride(),
+        dtype,
+        device,
+        "manual",
+        set_tensor=torch_a,
+    )
+    if identical == Identical.EQUAL:
+        if b_strides is None:
+            torch_b = torch_a.clone()
+        else:
+            # Create b with the desired strides and copy values from a to ensure equality
+            torch_b = torch.empty_strided(input_shape, b_strides, dtype=torch_dtype)
+            torch_b.copy_(torch_a)
+    else:
+        if b_strides is None:
+            torch_b = (torch.rand(input_shape) * 100 - 50).type(torch_dtype)
+        else:
+            torch_b = torch.empty_strided(input_shape, b_strides, dtype=torch_dtype)
+            if is_integer_dtype:
+                tmp_b = torch.randint(-50, 50, input_shape, dtype=torch_dtype)
+                torch_b.copy_(tmp_b)
+            else:
+                torch_b.uniform_(-50, 50)
+
+    b = TestTensor(
+        input_shape,
+        torch_b.stride(),
+        dtype,
+        device,
+        "manual",
+        set_tensor=torch_b,
+    )
+
+    # Reference result computed with torch.equal
+    c._torch_tensor = torch_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateAllEqualDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c.descriptor,
+            a.descriptor,
+            b.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [c, a, b]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAllEqualWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, c.device)
+
+    def lib_equal():
+        check_error(
+            LIBINFINIOP.infiniopAllEqual(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                c.data(),
+                a.data(),
+                b.data(),
+                None,
+            )
+        )
+
+    lib_equal()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(c.actual_tensor().to(torch.uint8), c.torch_tensor().to(torch.uint8), atol=atol, rtol=rtol)
+    assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch_equal(
+            c.torch_tensor(), a.torch_tensor(), b.torch_tensor()
+        ), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyAllEqualDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest all_equal passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index ba1ce33df..a7ca1a113 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -583,3 +583,32 @@ def softplus_(lib):
     ]
     lib.infiniopDestroySoftplusDescriptor.restype = c_int32
     lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
+@OpRegister.operator
+def all_equal_(lib):
+    lib.infiniopCreateAllEqualDescriptor.restype = c_int32
+    lib.infiniopCreateAllEqualDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetAllEqualWorkspaceSize.restype = c_int32
+    lib.infiniopGetAllEqualWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopAllEqual.restype = c_int32
+    lib.infiniopAllEqual.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyAllEqualDescriptor.restype = c_int32
+    lib.infiniopDestroyAllEqualDescriptor.argtypes = [infiniopOperatorDescriptor_t]
+
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py
index 162b199fe..8c7570d41 100644
--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -139,6 +139,8 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum):
 
 
 def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
+    if dt == InfiniDtype.BOOL:  # support torch.bool input dtype
+        return torch.bool
     if dt == InfiniDtype.I8:
         return torch.int8
     elif dt == InfiniDtype.I16:
@@ -269,7 +271,7 @@ def rearrange_tensor(tensor, new_strides):
         new_positions += offset
 
     # Copy the original data to the new tensor
-    new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
+    new_tensor.view(-1).index_add_(0, new_positions, tensor.reshape(-1))
     new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
     return new_tensor