From 454285c7014001e34991657205e8d805769d8758 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:01:49 +0800 Subject: [PATCH 01/16] [T1-1-1]: register new operators in framework --- include/infiniop.h | 9 + scripts/python_test.py | 9 + src/infiniop-test/include/ops.hpp | 18 ++ test/infiniop/libinfiniop/op_register.py | 287 +++++++++++++++++++++++ 4 files changed, 323 insertions(+) diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..c86127cb2 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -16,6 +16,15 @@ #include "infiniop/ops/rope.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/exp.h" +#include "infiniop/ops/sin.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/leakyrelu.h" +#include "infiniop/ops/tanh.h" +#include "infiniop/ops/sigmoid_backward.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/cast.h" +#include "infiniop/ops/where.h" #include "infiniop/tensor_descriptor.h" #endif // __INFINIOP_API_H__ diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..02fd65c63 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,15 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "exp.py", + "sin.py", + "cos.py", + "leakyrelu.py", + "tanh.py", + "sigmoid_backward.py", + "hardswish.py", + "cast.py", + "where.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..4c16eeec7 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,15 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(exp) +DECLARE_INFINIOP_TEST(sin) +DECLARE_INFINIOP_TEST(cos) +DECLARE_INFINIOP_TEST(leakyrelu) +DECLARE_INFINIOP_TEST(tanh) +DECLARE_INFINIOP_TEST(sigmoid_backward) +DECLARE_INFINIOP_TEST(hardswish) +DECLARE_INFINIOP_TEST(cast) +DECLARE_INFINIOP_TEST(where) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +52,15 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(exp) \ + REGISTER_INFINIOP_TEST(sin) \ + REGISTER_INFINIOP_TEST(cos) \ + REGISTER_INFINIOP_TEST(leakyrelu) \ + REGISTER_INFINIOP_TEST(tanh) \ + REGISTER_INFINIOP_TEST(sigmoid_backward)\ + REGISTER_INFINIOP_TEST(hardswish) \ + REGISTER_INFINIOP_TEST(cast) \ + REGISTER_INFINIOP_TEST(where) \ } namespace infiniop_test { diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..86cee0424 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -489,3 +489,290 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + 
c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def leakyrelu_(lib): + lib.infiniopCreateLeakyreluDescriptor.restype = c_int32 + lib.infiniopCreateLeakyreluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + ] + + lib.infiniopGetLeakyreluWorkspaceSize.restype = c_int32 + lib.infiniopGetLeakyreluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLeakyrelu.restype = c_int32 + lib.infiniopLeakyrelu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLeakyreluDescriptor.restype = c_int32 + lib.infiniopDestroyLeakyreluDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def sigmoid_backward_(lib): + lib.infiniopCreateSigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSigmoidBackwardWorkspaceSize.restype 
= c_int32 + lib.infiniopGetSigmoidBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSigmoidBackward.restype = c_int32 + lib.infiniopSigmoidBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardswishDescriptor.restype = c_int32 + lib.infiniopCreateHardswishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardswishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardswishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardswish.restype = c_int32 + lib.infiniopHardswish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHardswishDescriptor.restype = c_int32 + lib.infiniopDestroyHardswishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def cast_(lib): + lib.infiniopCreateCastDescriptor.restype = c_int32 + lib.infiniopCreateCastDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCastWorkspaceSize.restype = c_int32 + lib.infiniopGetCastWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCast.restype = c_int32 + lib.infiniopCast.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCastDescriptor.restype = c_int32 + lib.infiniopDestroyCastDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] \ No newline at end of file From 381ccbc646594fb86f8b61c2aebfd973ffc3f6e9 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:06:58 +0800 Subject: [PATCH 02/16] [T1-1-1]: Exp operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/exp.h | 24 +++ src/infiniop-test/src/ops/exp.cpp | 115 ++++++++++++++ src/infiniop/ops/exp/cpu/exp_cpu.cc | 52 +++++++ src/infiniop/ops/exp/cpu/exp_cpu.h | 21 +++ src/infiniop/ops/exp/cuda/kernel.cuh | 39 +++++ src/infiniop/ops/exp/metax/exp_metax.h | 8 + src/infiniop/ops/exp/metax/exp_metax.maca | 60 ++++++++ src/infiniop/ops/exp/nvidia/exp_nvidia.cu | 59 ++++++++ src/infiniop/ops/exp/nvidia/exp_nvidia.cuh | 8 + 
 src/infiniop/ops/exp/operator.cc | 142 ++++++++++++++++++
 test/infiniop/exp.py | 165 +++++++++++++++++++++
 11 files changed, 693 insertions(+)
 create mode 100644 include/infiniop/ops/exp.h
 create mode 100644 src/infiniop-test/src/ops/exp.cpp
 create mode 100644 src/infiniop/ops/exp/cpu/exp_cpu.cc
 create mode 100644 src/infiniop/ops/exp/cpu/exp_cpu.h
 create mode 100644 src/infiniop/ops/exp/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/exp/metax/exp_metax.h
 create mode 100644 src/infiniop/ops/exp/metax/exp_metax.maca
 create mode 100644 src/infiniop/ops/exp/nvidia/exp_nvidia.cu
 create mode 100644 src/infiniop/ops/exp/nvidia/exp_nvidia.cuh
 create mode 100644 src/infiniop/ops/exp/operator.cc
 create mode 100644 test/infiniop/exp.py

diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h
new file mode 100644
index 000000000..624bc5363
--- /dev/null
+++ b/include/infiniop/ops/exp.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_EXP_API_H__
+#define __INFINIOP_EXP_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopExpDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle,
+                                                        infiniopExpDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t output,
+                                                        infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *output,
+                                        const void *input,
+                                        void *stream);
+
+__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc);
+
+#endif
diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp
new file mode 100644
index 000000000..395408e15
--- /dev/null
+++ b/src/infiniop-test/src/ops/exp.cpp
@@ -0,0 +1,115 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <iostream>
+
+namespace infiniop_test::exp {
+struct Test::Attributes {
+    std::shared_ptr<Tensor> input;
+    std::shared_ptr<Tensor> output;
+    std::shared_ptr<Tensor> ans;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+    if (tensors.find("input") == tensors.end()
+        || tensors.find("output") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Test");
+    }
+
+    test->_attributes->input = tensors["input"];
+    test->_attributes->output = tensors["output"];
+    test->_attributes->ans = tensors["ans"];
+
+    auto elemType = test->_attributes->input->ggml_type();
+    if (elemType == GGML_TYPE_BF16) {
+        test->_rtol = 1e-2;
+        test->_atol = 1e-2;
+    }
+    if (elemType == GGML_TYPE_F16) {
+        test->_rtol = 1e-3;
+        test->_atol = 1e-3;
+    }
+    if (elemType == GGML_TYPE_F32) {
+        test->_rtol = 1e-6;
+        test->_atol = 1e-6;
+    }
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    infiniopExpDescriptor_t op_desc;
+    auto input = _attributes->input->to(device, device_id);
+    auto output = _attributes->output->to(device, device_id);
+    CHECK_OR(infiniopCreateExpDescriptor(handle, &op_desc,
+                                         output->desc(),
+                                         input->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetExpWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopExp(op_desc, workspace, workspace_size,
+                         output->data(),
+                         input->data(),
+                         nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        allClose(output, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopExp(
+                op_desc, workspace, workspace_size,
+                output->data(),
+                input->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"input", "output", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"output"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- output: " << _attributes->output->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+} // namespace infiniop_test::exp
diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc
new file mode 100644
index 000000000..58a6d0f2d
--- /dev/null
+++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc
@@ -0,0 +1,52 @@
+#include "exp_cpu.h"
+
+namespace op::exp::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ExpOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ExpOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ExpOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ExpOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::exp::cpu
diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h
new file mode 100644
index 000000000..fbf9ab126
--- /dev/null
+++ b/src/infiniop/ops/exp/cpu/exp_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __EXP_CPU_H__
+#define __EXP_CPU_H__
+
+#include <cmath>
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(exp, cpu)
+
+namespace op::exp::cpu {
+typedef struct ExpOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &input) const {
+        return std::exp(input);
+    }
+} ExpOp;
+} // namespace op::exp::cpu
+
+#endif // __EXP_CPU_H__
diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh
new file mode 100644
index 000000000..316a393be
--- /dev/null
+++ b/src/infiniop/ops/exp/cuda/kernel.cuh
@@ -0,0 +1,39 @@
+#ifndef __EXP_CUDA_H__
+#define __EXP_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::exp::cuda {
+typedef struct ExpOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(input);
+            float2 vr = make_float2(__expf(vf.x), __expf(vf.y));
+            return __float22half2_rn(vr);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float inputf = __half2float(input);
+            return __float2half_rn(__expf(inputf));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(input));
+            float f1 = __bfloat162float(__high2bfloat16(input));
+            return __floats2bfloat162_rn(__expf(f0), __expf(f1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float inputf = __bfloat162float(input);
+            return __float2bfloat16_rn(__expf(inputf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __expf(input);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return std::exp(input);
+        } else {
+            return std::exp(input);
+        }
+    }
+} ExpOp;
+} // namespace op::exp::cuda
+
+#endif // __EXP_CUDA_H__
diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h
new file mode 100644
index 000000000..fb10faf9b
--- /dev/null
+++ b/src/infiniop/ops/exp/metax/exp_metax.h
@@ -0,0 +1,8 @@
+#ifndef __EXP_METAX_API_H__
+#define __EXP_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(exp, metax)
+
+#endif // __EXP_METAX_API_H__
diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca
new file mode 100644
index 000000000..c71703c6d
--- /dev/null
+++ b/src/infiniop/ops/exp/metax/exp_metax.maca
@@ -0,0 +1,60 @@
+#include "exp_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::exp::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs,
stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..f4229a942 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..56f5d29cd --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" +#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + 
reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..eb139af12 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,165 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 
5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(output, input): + output.copy_(torch.exp(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + exp(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 3e1f1dc1f6a616b29657f96b144c93bb445a4a3e Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:08:39 +0800 Subject: [PATCH 03/16] [T1-1-1]: Sin operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/sin.h | 24 +++ src/infiniop-test/src/ops/sin.cpp | 114 ++++++++++++++ src/infiniop/ops/sin/cpu/sin_cpu.cc | 52 +++++++ src/infiniop/ops/sin/cpu/sin_cpu.h | 21 +++ src/infiniop/ops/sin/cuda/kernel.cuh | 39 
+++++ src/infiniop/ops/sin/metax/sin_metax.h | 8 + src/infiniop/ops/sin/metax/sin_metax.maca | 60 ++++++++ src/infiniop/ops/sin/nvidia/sin_nvidia.cu | 59 ++++++++ src/infiniop/ops/sin/nvidia/sin_nvidia.cuh | 8 + src/infiniop/ops/sin/operator.cc | 142 ++++++++++++++++++ test/infiniop/sin.py | 166 +++++++++++++++++++++ 11 files changed, 693 insertions(+) create mode 100644 include/infiniop/ops/sin.h create mode 100644 src/infiniop-test/src/ops/sin.cpp create mode 100644 src/infiniop/ops/sin/cpu/sin_cpu.cc create mode 100644 src/infiniop/ops/sin/cpu/sin_cpu.h create mode 100644 src/infiniop/ops/sin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sin/metax/sin_metax.h create mode 100644 src/infiniop/ops/sin/metax/sin_metax.maca create mode 100644 src/infiniop/ops/sin/nvidia/sin_nvidia.cu create mode 100644 src/infiniop/ops/sin/nvidia/sin_nvidia.cuh create mode 100644 src/infiniop/ops/sin/operator.cc create mode 100644 test/infiniop/sin.py diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..640deccc0 --- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp new file mode 100644 index 000000000..db256c283 --- /dev/null +++ b/src/infiniop-test/src/ops/sin.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sin { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSinDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSinDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return 
TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetSinWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopSin(op_desc, workspace, workspace_size,
+                         output->data(),
+                         input->data(),
+                         nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    try {
+        allClose(output, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopSin(
+                op_desc, workspace, workspace_size,
+                output->data(),
+                input->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"input", "output", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"output"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- output: " << _attributes->output->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+} // namespace infiniop_test::sin
diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc
new file mode 100644
index 000000000..88ba6cdd6
--- /dev/null
+++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc
@@ -0,0 +1,52 @@
+#include "sin_cpu.h"
+
+namespace op::sin::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SinOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SinOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<SinOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SinOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sin::cpu
diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h
new file mode 100644
index 000000000..e221c2573
--- /dev/null
+++ b/src/infiniop/ops/sin/cpu/sin_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __SIN_CPU_H__
+#define __SIN_CPU_H__
+
+#include <cmath>
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(sin, cpu)
+
+namespace op::sin::cpu {
+typedef struct SinOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &input) const {
+        return std::sin(input);
+    }
+} SinOp;
+} // namespace op::sin::cpu
+
+#endif // __SIN_CPU_H__
diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh
new file mode 100644
index 000000000..c9993ca12
--- /dev/null
+++ b/src/infiniop/ops/sin/cuda/kernel.cuh
@@ -0,0 +1,39 @@
+#ifndef __SIN_CUDA_H__
+#define __SIN_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::sin::cuda {
+typedef struct SinOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(input);
+            float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y));
+            return __float22half2_rn(vr);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float inputf = __half2float(input);
+            return __float2half_rn(sinf(inputf));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(input));
+            float f1 = __bfloat162float(__high2bfloat16(input));
+            return __floats2bfloat162_rn(__sinf(f0), __sinf(f1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float inputf = __bfloat162float(input);
+            return __float2bfloat16_rn(__sinf(inputf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return sinf(input);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return std::sin(input);
+        } else {
+            return std::sin(input);
+        }
+    }
+} SinOp;
+} // namespace op::sin::cuda
+
+#endif // __SIN_CUDA_H__
diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h
new file mode 100644
index 000000000..5b272d4d9
--- /dev/null
+++ b/src/infiniop/ops/sin/metax/sin_metax.h
@@ -0,0 +1,8 @@
+#ifndef __SIN_METAX_API_H__
+#define __SIN_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(sin, metax)
+
+#endif // __SIN_METAX_API_H__
diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca
new file mode 100644
index 000000000..5ea69e139
--- /dev/null
+++ b/src/infiniop/ops/sin/metax/sin_metax.maca
@@ -0,0 +1,60 @@
+#include "sin_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::sin::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs,
stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..eaac7a582 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..31f5b48ef --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_CUDA_API_H__ +#define __SIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_CUDA_API_H__ diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..38d8b242c --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + 
infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..613257e9c --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), 
+ ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(output, input): + output.copy_(torch.sin(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sin(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_sin(): + check_error( + LIBINFINIOP.infiniopSin( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 0ad8416e4ba757263aea672bce7190753f301984 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:09:56 +0800 Subject: [PATCH 04/16] [T1-1-1]: Cos operator with cpu nvidia metax iluvatar and 
test --- include/infiniop/ops/cos.h | 24 +++ src/infiniop-test/src/ops/cos.cpp | 114 ++++++++++++++ src/infiniop/ops/cos/cpu/cos_cpu.cc | 52 +++++++ src/infiniop/ops/cos/cpu/cos_cpu.h | 21 +++ src/infiniop/ops/cos/cuda/kernel.cuh | 49 ++++++ src/infiniop/ops/cos/metax/cos_metax.h | 8 + src/infiniop/ops/cos/metax/cos_metax.maca | 60 ++++++++ src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 59 ++++++++ src/infiniop/ops/cos/nvidia/cos_nvidia.cuh | 8 + src/infiniop/ops/cos/operator.cc | 142 ++++++++++++++++++ test/infiniop/cos.py | 166 +++++++++++++++++++++ 11 files changed, 703 insertions(+) create mode 100644 include/infiniop/ops/cos.h create mode 100644 src/infiniop-test/src/ops/cos.cpp create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.h create mode 100644 src/infiniop/ops/cos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cos/metax/cos_metax.h create mode 100644 src/infiniop/ops/cos/metax/cos_metax.maca create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cu create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cuh create mode 100644 src/infiniop/ops/cos/operator.cc create mode 100644 test/infiniop/cos.py diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..aeb551e77 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/cos.cpp b/src/infiniop-test/src/ops/cos.cpp new file mode 100644 index 000000000..7cae4574d --- /dev/null +++ b/src/infiniop-test/src/ops/cos.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cos { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCosDescriptor_t 
op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateCosDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCosWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCos(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCos( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::cos diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..f5d27ec49 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,52 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h 
b/src/infiniop/ops/cos/cpu/cos_cpu.h
new file mode 100644
index 000000000..37efb7597
--- /dev/null
+++ b/src/infiniop/ops/cos/cpu/cos_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __COS_CPU_H__
+#define __COS_CPU_H__
+
+#include <cmath>
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(cos, cpu)
+
+namespace op::cos::cpu {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &input) const {
+        return std::cos(input);
+    }
+} CosOp;
+} // namespace op::cos::cpu
+
+#endif // __COS_CPU_H__
diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh
new file mode 100644
index 000000000..381a897f0
--- /dev/null
+++ b/src/infiniop/ops/cos/cuda/kernel.cuh
@@ -0,0 +1,49 @@
+#ifndef __COS_CUDA_H__
+#define __COS_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cmath>
+
+namespace op::cos::cuda {
+typedef struct CosOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        auto cos_f32 = [] __device__ (float x) {
+            double xd = static_cast<double>(x);
+            double yd = std::cos(xd);
+            return static_cast<float>(yd);
+        };
+
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(input);
+            float2 vr = make_float2(
+                cos_f32(vf.x),
+                cos_f32(vf.y)
+            );
+            return __float22half2_rn(vr);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(input);
+            float yf = cos_f32(xf);
+            return __float2half_rn(yf);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(input));
+            float f1 = __bfloat162float(__high2bfloat16(input));
+            return __floats2bfloat162_rz(cos_f32(f0), cos_f32(f1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(input);
+            return __float2bfloat16_rz(cos_f32(xf));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return cos_f32(input);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return std::cos(input);
+        } else {
+            return std::cos(input);
+        }
+    }
+} CosOp;
+} // namespace op::cos::cuda
+
+#endif // __COS_CUDA_H__
diff --git a/src/infiniop/ops/cos/metax/cos_metax.h b/src/infiniop/ops/cos/metax/cos_metax.h
new file mode 100644
index 000000000..a98fa3211
--- /dev/null
+++ b/src/infiniop/ops/cos/metax/cos_metax.h
@@ -0,0 +1,8 @@
+#ifndef __COS_METAX_API_H__
+#define __COS_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(cos, metax)
+
+#endif // __COS_METAX_API_H__
diff --git a/src/infiniop/ops/cos/metax/cos_metax.maca b/src/infiniop/ops/cos/metax/cos_metax.maca
new file mode 100644
index 000000000..144db47ef
--- /dev/null
+++ b/src/infiniop/ops/cos/metax/cos_metax.maca
@@ -0,0 +1,60 @@
+#include "cos_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::cos::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create Metax elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::metax diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu new file mode 100644 index 000000000..a3c38bc89 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cos_nvidia.cuh" + +namespace op::cos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh new file mode 100644 index 000000000..f6c350dd6 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COS_CUDA_API_H__ +#define __COS_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_CUDA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..11781d591 --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,142 @@ +#include 
"../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cos_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cos_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..d1d94db3a --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + 
InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos(output, input): + output.copy_(torch.cos(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + cos(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_cos(): + check_error( + LIBINFINIOP.infiniopCos( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_cos() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cos(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + 
NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From b41cbd2bd637dded8c9c18144ffd007273f22cb9 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:11:08 +0800 Subject: [PATCH 05/16] [T1-1-1]: Leakyrelu operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/leakyrelu.h | 25 +++ src/infiniop-test/src/ops/leakyrelu.cpp | 123 ++++++++++++ .../ops/leakyrelu/cpu/leakyrelu_cpu.cc | 104 ++++++++++ .../ops/leakyrelu/cpu/leakyrelu_cpu.h | 7 + src/infiniop/ops/leakyrelu/cuda/kernel.cuh | 69 +++++++ src/infiniop/ops/leakyrelu/info.h | 52 +++++ src/infiniop/ops/leakyrelu/leakyrelu.h | 49 +++++ .../ops/leakyrelu/metax/leakyrelu_metax.h | 8 + .../ops/leakyrelu/metax/leakyrelu_metax.maca | 174 +++++++++++++++++ .../ops/leakyrelu/nvidia/leakyrelu_nvidia.cu | 178 ++++++++++++++++++ .../ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh | 8 + src/infiniop/ops/leakyrelu/operator.cc | 164 ++++++++++++++++ test/infiniop/leakyrelu.py | 168 +++++++++++++++++ 13 files changed, 1129 insertions(+) create mode 100644 include/infiniop/ops/leakyrelu.h create mode 100644 src/infiniop-test/src/ops/leakyrelu.cpp create mode 100644 src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc create mode 100644 src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h create mode 100644 src/infiniop/ops/leakyrelu/cuda/kernel.cuh create mode 100644 src/infiniop/ops/leakyrelu/info.h create mode 100644 src/infiniop/ops/leakyrelu/leakyrelu.h create mode 100644 src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h create mode 100644 src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca create mode 100644 src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu create mode 100644 src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh create mode 100644 src/infiniop/ops/leakyrelu/operator.cc create mode 100644 test/infiniop/leakyrelu.py diff --git a/include/infiniop/ops/leakyrelu.h b/include/infiniop/ops/leakyrelu.h new file mode 100644 index 000000000..9ce93d53c --- /dev/null +++ b/include/infiniop/ops/leakyrelu.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_LEAKYRELU_API_H__ +#define __INFINIOP_LEAKYRELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLeakyreluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLeakyreluDescriptor(infiniopHandle_t handle, + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float negative_slope); + +__C __export infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/leakyrelu.cpp b/src/infiniop-test/src/ops/leakyrelu.cpp new file mode 100644 index 000000000..c63741120 --- /dev/null +++ b/src/infiniop-test/src/ops/leakyrelu.cpp @@ -0,0 +1,123 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::leakyrelu { +struct Test::Attributes { + float negative_slope; + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, 
+ double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (attributes.find("negative_slope") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->negative_slope = *reinterpret_cast(attributes["negative_slope"].data()); + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLeakyreluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateLeakyreluDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + _attributes->negative_slope), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetLeakyreluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + CHECK_OR(infiniopLeakyrelu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLeakyrelu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"negative_slope"}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- negative_slope=" << _attributes->negative_slope << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::leakyrelu diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc new file mode 100644 index 000000000..cd56f0ca6 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc @@ -0,0 +1,104 @@ +#include "leakyrelu_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../info.h" +#include "infinicore.h" +#include + +namespace op::leakyrelu::cpu { + +struct 
Descriptor::Opaque {}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + + *desc_ptr = new Descriptor( + info_r.take(), + 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { return _min_workspace_size; } + +template +static inline void cpu_leakyrelu_impl_incremental( + void *output, const void *input, const op::leakyrelu::LeakyReLUInfo &info) { + + const size_t ndim = info.shape.size(); + const size_t n = info.n; + + if (n == 0) return; + + auto out_base = reinterpret_cast(output); + auto in_base = reinterpret_cast(input); + + const std::vector &shape = info.shape; + const std::vector &in_stride = info.in_stride; + const std::vector &out_stride = info.out_stride; + + std::vector idx(ndim, 0); + ptrdiff_t in_off = 0; + ptrdiff_t out_off = 0; + + for (size_t it = 0; it < n; ++it) { + const T *in_elem = in_base + in_off; + T *out_elem = out_base + out_off; + + float v = utils::cast(*in_elem); + float outv = v >= 0.0f ? v : v * info.negative_slope; + *out_elem = utils::cast(outv); + for (int d = static_cast(ndim) - 1; d >= 0; --d) { + idx[d] += 1; + if (in_stride[d] != 0) in_off += in_stride[d]; + if (out_stride[d] != 0) out_off += out_stride[d]; + + if (idx[d] < shape[d]) { + break; + } else { + idx[d] = 0; + if (in_stride[d] != 0) in_off -= static_cast(shape[d]) * in_stride[d]; + if (out_stride[d] != 0) out_off -= static_cast(shape[d]) * out_stride[d]; + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_BF16: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_F32: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_F64: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::leakyrelu::cpu diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h new file mode 100644 index 000000000..e58ca1409 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h @@ -0,0 +1,7 @@ +#ifndef __LEAKYRELU_CPU_H__ +#define __LEAKYRELU_CPU_H__ +#include "../leakyrelu.h" + +DESCRIPTOR(cpu) + +#endif // __LEAKYRELU_CPU_H__ diff --git a/src/infiniop/ops/leakyrelu/cuda/kernel.cuh b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh new file mode 100644 index 000000000..abad71b6a --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh @@ -0,0 +1,69 @@ +#ifndef __LEAKYRELU_CUDA_KERNEL_CUH__ +#define __LEAKYRELU_CUDA_KERNEL_CUH__ + +#include +#include +#include +#include + +template +__device__ __forceinline__ float to_float_for_leaky(const DevT &v) { + if constexpr (std::is_same_v) { + return __half2float(v); + } else if constexpr (std::is_same_v) { + return __bfloat162float(v); + } else { + return static_cast(v); + } +} + +template +__device__ __forceinline__ DevT from_float_for_leaky(float f) { + if constexpr 
(std::is_same_v) { + return __float2half_rn(f); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(f); + } else { + return static_cast(f); + } +} + +template +__global__ void leakyrelu_kernel( + DevT *__restrict__ out, + const DevT *__restrict__ in, + size_t n, + float negative_slope, + const size_t *__restrict__ shape, + const size_t *__restrict__ div, + const long long *__restrict__ in_stride, + const long long *__restrict__ out_stride, + int ndim) { + + size_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t grid_stride = static_cast(blockDim.x) * gridDim.x; + + for (size_t linear = gid; linear < n; linear += grid_stride) { + unsigned long long rem = linear; + long long in_off = 0; + long long out_off = 0; + for (int d = 0; d < ndim; ++d) { + unsigned long long idx_d = 0; + size_t divisor = div[d]; + if (divisor != 0) { + idx_d = rem / divisor; + rem = rem % divisor; + } else { + idx_d = 0; + } + if (in_stride[d] != 0) in_off += static_cast(idx_d) * in_stride[d]; + if (out_stride[d] != 0) out_off += static_cast(idx_d) * out_stride[d]; + } + + float v = to_float_for_leaky(in[static_cast(in_off)]); + float outv = v >= 0.0f ? v : v * negative_slope; + out[static_cast(out_off)] = from_float_for_leaky(outv); + } +} + +#endif // __LEAKYRELU_CUDA_KERNEL_CUH__ diff --git a/src/infiniop/ops/leakyrelu/info.h b/src/infiniop/ops/leakyrelu/info.h new file mode 100644 index 000000000..dd0a2d3ad --- /dev/null +++ b/src/infiniop/ops/leakyrelu/info.h @@ -0,0 +1,52 @@ +#ifndef __LEAKYRELU_INFO_H__ +#define __LEAKYRELU_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::leakyrelu { + +class LeakyReLUInfo { + LeakyReLUInfo() = default; + +public: + infiniDtype_t dt_in; + std::vector shape; + std::vector in_stride; + std::vector out_stride; + size_t n; + float negative_slope; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + + auto dt_raw = in_desc->dtype(); + infiniDtype_t dt_in = dt_raw; + + CHECK_DTYPE(dt_in, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(out_desc->ndim() == in_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + for (size_t i = 0; i < out_desc->ndim(); ++i) { + CHECK_OR_RETURN(out_desc->dim(i) == in_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t n = 1; + for (size_t i = 0; i < in_desc->ndim(); ++i) n *= static_cast(in_desc->dim(i)); + + return utils::Result(LeakyReLUInfo{ + dt_in, + out_desc->shape(), + in_desc->strides(), + out_desc->strides(), + n, + negative_slope + }); + } +}; + +} // namespace op::leakyrelu + +#endif // __LEAKYRELU_INFO_H__ diff --git a/src/infiniop/ops/leakyrelu/leakyrelu.h b/src/infiniop/ops/leakyrelu/leakyrelu.h new file mode 100644 index 000000000..a6a01a85b --- /dev/null +++ b/src/infiniop/ops/leakyrelu/leakyrelu.h @@ -0,0 +1,49 @@ +#ifndef __LEAKYRELU_H__ +#define __LEAKYRELU_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::leakyrelu::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + LeakyReLUInfo _info; \ + size_t _min_workspace_size; \ + \ + Descriptor( \ + LeakyReLUInfo info, \ + size_t min_workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _min_workspace_size(min_workspace_size) {} \ + \ 
+ public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc, \ + float negative_slope); \ + \ + size_t workspaceSize() const; \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __LEAKYRELU_H__ diff --git a/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h new file mode 100644 index 000000000..15cdccc61 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h @@ -0,0 +1,8 @@ +#ifndef __LEAKYRELU_METAX_API_H__ +#define __LEAKYRELU_METAX_API_H__ + +#include "../leakyrelu.h" + +DESCRIPTOR(metax) + +#endif // __LEAKYRELU_METAX_API_H__ diff --git a/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca new file mode 100644 index 000000000..871c3f663 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca @@ -0,0 +1,174 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../leakyrelu.h" +#include "leakyrelu_metax.h" +#include "../info.h" + +namespace op::leakyrelu::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapHcType { using Type = T; }; +template <> struct MapHcType { using Type = half; }; +#if defined(__HC_BF16_TYPES_EXIST__) || defined(__HC_ARCH__) +template <> struct MapHcType { using Type = __nv_bfloat16; }; +#endif + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t metax_leakyrelu_impl_incremental( + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + hcError_t propErr; + int device_id_local = 0; + using DevT = typename MapHcType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + hcError_t err = hcSuccess; + + err = hcMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != 
hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + + device_id_local = 0; + propErr = hcGetDevice(&device_id_local); + if (propErr == hcSuccess) { + hcDeviceProp_t prop; + if (hcGetDeviceProperties(&prop, device_id_local) == hcSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + leakyrelu_kernel<<>>( + out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = hcGetLastError(); + if (err != hcSuccess) goto cleanup; + + err = hcStreamSynchronize(stream); + if (err != hcSuccess) goto cleanup; + + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::leakyrelu::metax diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu new file mode 100644 index 000000000..05d149d5e --- /dev/null +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu @@ -0,0 +1,178 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../leakyrelu.h" +#include "leakyrelu_nvidia.cuh" +#include "../info.h" +#include +#include +#include +#include + +namespace op::leakyrelu::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapCudaType { using Type = T; }; +template <> struct MapCudaType { using Type = half; }; +#if defined(__CUDA_BF16_TYPES_EXIST__) || defined(__CUDA_ARCH__) +template <> struct MapCudaType { using Type = __nv_bfloat16; }; +#endif + 
+infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t cuda_leakyrelu_impl_incremental( + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + cudaError_t propErr; + int device_id_local = 0; + using DevT = typename MapCudaType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + cudaError_t err = cudaSuccess; + + err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + + device_id_local = 0; + propErr = cudaGetDevice(&device_id_local); + if (propErr == cudaSuccess) { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + leakyrelu_kernel<<>>( + out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = cudaGetLastError(); + if (err != cudaSuccess) goto cleanup; + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) goto cleanup; + + cudaFree(d_shape); + cudaFree(d_div); + 
cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::leakyrelu::nvidia diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh new file mode 100644 index 000000000..fb891a6c9 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LEAKYRELU_CUDA_API_H__ +#define __LEAKYRELU_CUDA_API_H__ + +#include "../leakyrelu.h" + +DESCRIPTOR(nvidia) + +#endif // __LEAKYRELU_CUDA_API_H__ diff --git a/src/infiniop/ops/leakyrelu/operator.cc b/src/infiniop/ops/leakyrelu/operator.cc new file mode 100644 index 000000000..ad6d504a8 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/operator.cc @@ -0,0 +1,164 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/leakyrelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/leakyrelu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/leakyrelu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/leakyrelu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLeakyreluDescriptor( + infiniopHandle_t handle, + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + float negative_slope) { + +#define CREATE_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return op::leakyrelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + negative_slope) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + CREATE_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CREATE_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef CREATE_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, 
kunlun); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, void *workspace, size_t workspace_size, + void *y, const void *x, void *stream) { + +#define CALC_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALC_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALC_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALC_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + CALC_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + CALC_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CALC_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALC_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef CALC_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc) { + +#define DESTROY_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + DESTROY_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + DESTROY_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + DESTROY_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DESTROY_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef DESTROY_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/test/infiniop/leakyrelu.py b/test/infiniop/leakyrelu.py new file mode 100644 index 000000000..93a8170d2 --- /dev/null +++ b/test/infiniop/leakyrelu.py @@ -0,0 +1,168 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, 
InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def leakyrelu(output, input, negative_slope): + output.copy_(torch.where(input >= 0, input, input * negative_slope)) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + negative_slope = 0.1 + print( + f"Testing Leakyrelu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} negative_slope:{negative_slope} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + leakyrelu(output.torch_tensor(), input.torch_tensor(), negative_slope) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLeakyreluDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + negative_slope + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLeakyreluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_leakyrelu(): + check_error( + LIBINFINIOP.infiniopLeakyrelu( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None + ) + ) + + lib_leakyrelu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: leakyrelu(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_leakyrelu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLeakyreluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From a8683cd498192f08ba2f308f7243ea2f3a3a97e2 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:11:57 +0800 Subject: [PATCH 06/16] [T1-1-1]: Tanh operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/tanh.h | 24 +++ src/infiniop-test/src/ops/tanh.cpp | 114 +++++++++++++ src/infiniop/ops/tanh/cpu/tanh_cpu.cc | 52 ++++++ src/infiniop/ops/tanh/cpu/tanh_cpu.h | 21 +++ src/infiniop/ops/tanh/cuda/kernel.cuh | 46 +++++ src/infiniop/ops/tanh/metax/tanh_metax.h | 8 + src/infiniop/ops/tanh/metax/tanh_metax.maca | 60 +++++++ 
src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu | 59 +++++++ src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh | 8 + src/infiniop/ops/tanh/operator.cc | 142 ++++++++++++++++ test/infiniop/tanh.py | 166 +++++++++++++++++++ 11 files changed, 700 insertions(+) create mode 100644 include/infiniop/ops/tanh.h create mode 100644 src/infiniop-test/src/ops/tanh.cpp create mode 100644 src/infiniop/ops/tanh/cpu/tanh_cpu.cc create mode 100644 src/infiniop/ops/tanh/cpu/tanh_cpu.h create mode 100644 src/infiniop/ops/tanh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tanh/metax/tanh_metax.h create mode 100644 src/infiniop/ops/tanh/metax/tanh_metax.maca create mode 100644 src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu create mode 100644 src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh create mode 100644 src/infiniop/ops/tanh/operator.cc create mode 100644 test/infiniop/tanh.py diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..62974e951 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define __INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp new file mode 100644 index 000000000..bb8c6b081 --- /dev/null +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tanh { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTanhDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t 
workspace_size; + CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTanh( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::tanh diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..23a92ed65 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,52 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..5dc73b383 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,21 @@ +#ifndef __TANH_CPU_H__ +#define __TANH_CPU_H__ + +#include +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace 
op::tanh::cpu {
+typedef struct TanhOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &input) const {
+        return std::tanh(input);
+    }
+} TanhOp;
+} // namespace op::tanh::cpu
+
+#endif // __TANH_CPU_H__
diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh
new file mode 100644
index 000000000..49605aa93
--- /dev/null
+++ b/src/infiniop/ops/tanh/cuda/kernel.cuh
@@ -0,0 +1,46 @@
+#ifndef __TANH_CUDA_H__
+#define __TANH_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cmath>
+
+namespace op::tanh::cuda {
+typedef struct TanhOp {
+    static constexpr size_t num_inputs = 1;
+
+    __device__ __forceinline__ float tanh_f32_func(float x) const {
+        return tanhf(x);
+    }
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(input);
+            float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y));
+            return __float22half2_rn(vr);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(input);
+            float yf = tanh_f32_func(xf);
+            return __float2half_rn(yf);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(input));
+            float f1 = __bfloat162float(__high2bfloat16(input));
+            float r0 = tanh_f32_func(f0);
+            float r1 = tanh_f32_func(f1);
+            return __floats2bfloat162_rn(r0, r1);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float xf = __bfloat162float(input);
+            float rf = tanh_f32_func(xf);
+            return __float2bfloat16_rn(rf);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return tanh_f32_func(input);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return std::tanh(input);
+        } else {
+            return std::tanh(input);
+        }
+    }
+} TanhOp;
+} // namespace op::tanh::cuda
+
+#endif // __TANH_CUDA_H__
diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h b/src/infiniop/ops/tanh/metax/tanh_metax.h
new file mode 100644
index 000000000..8432a7f0d
--- /dev/null
+++ b/src/infiniop/ops/tanh/metax/tanh_metax.h
@@ -0,0 +1,8 @@
+#ifndef __TANH_METAX_API_H__
+#define __TANH_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(tanh, metax)
+
+#endif // __TANH_METAX_API_H__
diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca
new file mode 100644
index 000000000..0a01554c4
--- /dev/null
+++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca
@@ -0,0 +1,60 @@
+#include "tanh_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::tanh::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create Metax elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+
switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..eeb6c85bf --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..cb37b2528 --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANH_CUDA_API_H__ +#define __TANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) + +#endif // __TANH_CUDA_API_H__ diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc new file mode 100644 index 000000000..a5ed56f74 --- /dev/null +++ b/src/infiniop/ops/tanh/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include 
"nvidia/tanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanh_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTanhDescriptor( + infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanh( + infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..dc6ec46e8 --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# 
Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(output, input): + output.copy_(torch.tanh(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + tanh(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_tanh(): + check_error( + LIBINFINIOP.infiniopTanh( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 
8bb2121f57498e5f2d3e0f86c46c5263b13ff175 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:12:45 +0800 Subject: [PATCH 07/16] [T1-1-1]: Sigmoid_backward operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/sigmoid_backward.h | 26 +++ .../src/ops/sigmoid_backward.cpp | 122 ++++++++++++ .../cpu/sigmoid_backward_cpu.cc | 54 +++++ .../cpu/sigmoid_backward_cpu.h | 31 +++ .../ops/sigmoid_backward/cuda/kernel.cuh | 62 ++++++ .../metax/sigmoid_backward_metax.h | 8 + .../metax/sigmoid_backward_metax.maca | 62 ++++++ .../nvidia/sigmoid_backward_nvidia.cu | 61 ++++++ .../nvidia/sigmoid_backward_nvidia.cuh | 8 + src/infiniop/ops/sigmoid_backward/operator.cc | 145 ++++++++++++++ test/infiniop/sigmoid_backward.py | 184 ++++++++++++++++++ 11 files changed, 763 insertions(+) create mode 100644 include/infiniop/ops/sigmoid_backward.h create mode 100644 src/infiniop-test/src/ops/sigmoid_backward.cpp create mode 100644 src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc create mode 100644 src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h create mode 100644 src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h create mode 100644 src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca create mode 100644 src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu create mode 100644 src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh create mode 100644 src/infiniop/ops/sigmoid_backward/operator.cc create mode 100644 test/infiniop/sigmoid_backward.py diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h new file mode 100644 index 000000000..2bcc5dee6 --- /dev/null +++ b/include/infiniop/ops/sigmoid_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_SIGMOID_BACKWARD_API_H__ +#define __INFINIOP_SIGMOID_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void* grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/sigmoid_backward.cpp b/src/infiniop-test/src/ops/sigmoid_backward.cpp new file mode 100644 index 000000000..116055300 --- /dev/null +++ b/src/infiniop-test/src/ops/sigmoid_backward.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sigmoid_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if 
(tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSigmoidBackwardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateSigmoidBackwardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSigmoidBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSigmoidBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSigmoidBackward( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::sigmoid_backward diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc new file mode 100644 index 000000000..ea3d5e63c --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "sigmoid_backward_cpu.h" + +namespace op::sigmoid_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + 
infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::cpu diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h new file mode 100644 index 000000000..b2f87c2ea --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h @@ -0,0 +1,31 @@ +#ifndef __SIGMOID_BACKWARD_CPU_H__ +#define __SIGMOID_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, cpu) + +namespace op::sigmoid_backward::cpu { +typedef struct SigmoidBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &x, const T &grad_out) const { + using ComputeT = + std::conditional_t || std::is_same_v, + float, T>; + ComputeT xv = utils::cast(x); + ComputeT gov = utils::cast(grad_out); + + // sigmoid(x) = 1 / (1 + exp(-x)) + ComputeT s = static_cast(1) / (static_cast(1) + std::exp(-xv)); + + // grad_input = grad_output * s * (1 - s) + ComputeT gin = gov * s * (static_cast(1) - s); + + return utils::cast(gin); + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cpu + +#endif // __SIGMOID_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh new file mode 100644 index 000000000..6c10dd26e --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh @@ -0,0 +1,62 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_H__ +#define __SIGMOID_BACKWARD_CUDA_H__ + +#include +#include +#include +#include + +namespace op::sigmoid_backward::cuda { +typedef struct SigmoidBackwardOp { +public: + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ T operator()(const T &x, const T &grad_out) const { + if constexpr (std::is_same_v) { + float2 xf = __half22float2(x); + float2 gf = __half22float2(grad_out); + float2 sf; + sf.x = 1.0f / (1.0f + __expf(-xf.x)); + sf.y = 1.0f / (1.0f + __expf(-xf.y)); + float2 gr; + gr.x = gf.x * sf.x * (1.0f - sf.x); + gr.y = gf.y * sf.y * (1.0f - sf.y); + return __float22half2_rn(gr); + } else if constexpr (std::is_same_v) { + float xf 
= __half2float(x); + float gf = __half2float(grad_out); + float s = 1.0f / (1.0f + __expf(-xf)); + float gr = gf * s * (1.0f - s); + return __float2half_rn(gr); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + float g0 = __bfloat162float(__low2bfloat16(grad_out)); + float g1 = __bfloat162float(__high2bfloat16(grad_out)); + float s0 = 1.0f / (1.0f + __expf(-f0)); + float s1 = 1.0f / (1.0f + __expf(-f1)); + float r0 = g0 * s0 * (1.0f - s0); + float r1 = g1 * s1 * (1.0f - s1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float gf = __bfloat162float(grad_out); + float s = 1.0f / (1.0f + __expf(-xf)); + float gr = gf * s * (1.0f - s); + return __float2bfloat16_rn(gr); + } else if constexpr (std::is_same_v) { + float s = 1.0f / (1.0f + __expf(-x)); + return grad_out * s * (1.0f - s); + } else if constexpr (std::is_same_v) { + double s = 1.0 / (1.0 + std::exp(-x)); + return grad_out * s * (1.0 - s); + } else { + auto s = static_cast(1) / (static_cast(1) + std::exp(-static_cast(x))); + return static_cast(static_cast(grad_out) * s * (1.0f - s)); + } + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cuda + +#endif // __SIGMOID_BACKWARD_CUDA_H__ diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h new file mode 100644 index 000000000..fa1708559 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_METAX_API_H__ +#define __SIGMOID_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, metax) + +#endif // __SIGMOID_BACKWARD_METAX_API_H__ diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca new file mode 100644 index 000000000..ed99ac65d --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca @@ -0,0 +1,62 @@ +#include "sigmoid_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sigmoid_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, 
cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::metax diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu new file mode 100644 index 000000000..e7e604af4 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sigmoid_backward_nvidia.cuh" + +namespace op::sigmoid_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::nvidia diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh new file mode 100644 index 000000000..822f870fe --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_API_H__ +#define __SIGMOID_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, nvidia) + +#endif // __SIGMOID_BACKWARD_CUDA_API_H__ diff --git 
a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc new file mode 100644 index 000000000..f30a646d0 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sigmoid_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sigmoid_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sigmoid_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( + infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSigmoidBackward( + infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); 
+#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/sigmoid_backward.py b/test/infiniop/sigmoid_backward.py new file mode 100644 index 000000000..813791aa8 --- /dev/null +++ b/test/infiniop/sigmoid_backward.py @@ -0,0 +1,184 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, grad_output_stride, grad_input_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + INPLACE_GRAD_OUTPUT = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, + Inplace.INPLACE_GRAD_OUTPUT, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sigmoid_backward(grad_input, input_tensor, grad_output): + sigmoid_input = torch.sigmoid(input_tensor) + grad_input.copy_(grad_output * sigmoid_input * (1 - sigmoid_input)) + + +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input_tensor = TestTensor(shape, input_stride, dtype, device) + grad_output = TestTensor(shape, grad_output_stride, dtype, device) + + if inplace == Inplace.INPLACE_INPUT: + if input_stride != grad_input_stride: + return + grad_input = input_tensor + elif inplace == Inplace.INPLACE_GRAD_OUTPUT: + if grad_input_stride != grad_output_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + return + + print( + f"Testing SigmoidBackward on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sigmoid_backward(grad_input.torch_tensor(), input_tensor.torch_tensor(), grad_output.torch_tensor()) + + 
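# Reference above uses d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)); synchronize the device before exercising the library path.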
if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, + input_tensor.descriptor, + grad_output.descriptor, + ) + ) + + for tensor in [input_tensor, grad_output, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_sigmoid_backward(): + check_error( + LIBINFINIOP.infiniopSigmoidBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input_tensor.data(), + grad_output.data(), + None, + ) + ) + + lib_sigmoid_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sigmoid_backward(grad_input.torch_tensor(), input_tensor.torch_tensor(), grad_output.torch_tensor()), + device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sigmoid_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySigmoidBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From f38ea0d3f074520d01a7e2aadbcc8a63ed3984e4 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:13:25 +0800 Subject: [PATCH 08/16] [T1-1-1]: Hardswish operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/hardswish.h | 24 +++ src/infiniop-test/src/ops/hardswish.cpp | 114 ++++++++++++ .../ops/hardswish/cpu/hardswish_cpu.cc | 52 ++++++ .../ops/hardswish/cpu/hardswish_cpu.h | 30 ++++ src/infiniop/ops/hardswish/cuda/kernel.cuh | 56 ++++++ .../ops/hardswish/metax/hardswish_metax.h | 8 + .../ops/hardswish/metax/hardswish_metax.maca | 60 +++++++ .../ops/hardswish/nvidia/hardswish_nvidia.cu | 59 +++++++ .../ops/hardswish/nvidia/hardswish_nvidia.cuh | 8 + src/infiniop/ops/hardswish/operator.cc | 142 +++++++++++++++ test/infiniop/hardswish.py | 167 ++++++++++++++++++ 11 files changed, 720 insertions(+) create mode 100644 include/infiniop/ops/hardswish.h create mode 100644 src/infiniop-test/src/ops/hardswish.cpp create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc create mode 100644 src/infiniop/ops/hardswish/cpu/hardswish_cpu.h create mode 100644 src/infiniop/ops/hardswish/cuda/kernel.cuh create mode 100644 src/infiniop/ops/hardswish/metax/hardswish_metax.h create mode 100644 src/infiniop/ops/hardswish/metax/hardswish_metax.maca create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu create mode 100644 src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh create mode 100644 src/infiniop/ops/hardswish/operator.cc create mode 100644 test/infiniop/hardswish.py diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..79a7c93ea --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ 
+#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp new file mode 100644 index 000000000..25b161ccf --- /dev/null +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::hardswish { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopHardswishDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopHardswish( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + 
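+// Hardswish takes no scalar attributes; a test case is described entirely by its input/output/ans tensors.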
+std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::hardswish diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..e7b68508a --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..a42009017 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,30 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardswishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x = static_cast(input); + double y = x + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x * (y / 6.0); + return static_cast(out); + } + } +} HardswishOp; +} // namespace op::hardswish::cpu + +#endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..be22e5faa --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,56 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#include +#include + +namespace 
op::hardswish::cuda { + +typedef struct HardswishOp { + static constexpr size_t num_inputs = 1; + + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 6.0f : y); + return x * (y * (1.0f / 6.0f)); + } + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y) + ); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } + } +} HardswishOp; + +} // namespace op::hardswish::cuda + +#endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 index 000000000..16b131aa9 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_METAX_API_H__ +#define __HARDSWISH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // __HARDSWISH_METAX_API_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..e53b94357 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,60 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::metax diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..0aff55cd2 --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..f869ad52f --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // __HARDSWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..7787c799b --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include 
"nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__C infiniStatus_t infiniopCreateHardswishDescriptor( + infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHardswish( + infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..424b30567 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,167 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# 
======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hardswish(output, input): + output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + hardswish(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardswishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardswishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardswish( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = 
args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 7c848685877e27582c2bef535ac1ebada7cbf613 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:14:16 +0800 Subject: [PATCH 09/16] [T1-1-1]: Cast operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/cast.h | 24 ++ src/infiniop-test/src/ops/cast.cpp | 122 ++++++++++ src/infiniop/ops/cast/cast.h | 48 ++++ src/infiniop/ops/cast/cpu/cast_cpu.cc | 135 ++++++++++ src/infiniop/ops/cast/cpu/cast_cpu.h | 8 + src/infiniop/ops/cast/cuda/kernel.cuh | 75 ++++++ src/infiniop/ops/cast/info.h | 58 +++++ src/infiniop/ops/cast/metax/cast_metax.h | 8 + src/infiniop/ops/cast/metax/cast_metax.maca | 201 +++++++++++++++ src/infiniop/ops/cast/nvidia/cast_nvidia.cu | 205 ++++++++++++++++ src/infiniop/ops/cast/nvidia/cast_nvidia.cuh | 8 + src/infiniop/ops/cast/operator.cc | 142 +++++++++++ test/infiniop/cast.py | 244 +++++++++++++++++++ 13 files changed, 1278 insertions(+) create mode 100644 include/infiniop/ops/cast.h create mode 100644 src/infiniop-test/src/ops/cast.cpp create mode 100644 src/infiniop/ops/cast/cast.h create mode 100644 src/infiniop/ops/cast/cpu/cast_cpu.cc create mode 100644 src/infiniop/ops/cast/cpu/cast_cpu.h create mode 100644 src/infiniop/ops/cast/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cast/info.h create mode 100644 src/infiniop/ops/cast/metax/cast_metax.h create mode 100644 src/infiniop/ops/cast/metax/cast_metax.maca create mode 100644 src/infiniop/ops/cast/nvidia/cast_nvidia.cu create mode 100644 src/infiniop/ops/cast/nvidia/cast_nvidia.cuh create mode 100644 src/infiniop/ops/cast/operator.cc create mode 100644 test/infiniop/cast.py diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h new file mode 100644 index 000000000..82b41490e --- /dev/null +++ b/include/infiniop/ops/cast.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CAST_API_H__ +#define __INFINIOP_CAST_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCastDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/cast.cpp b/src/infiniop-test/src/ops/cast.cpp new file mode 100644 index 000000000..d91f5eb6c --- /dev/null +++ b/src/infiniop-test/src/ops/cast.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cast { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + 
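For orientation, the PyTorch reference used by the HardSwish test above is the closed form x * clamp(x + 3, 0, 6) / 6. A minimal self-contained sketch (illustrative only; the helper name is not part of the patch) confirming that this formula agrees with torch.nn.functional.hardswish:

    import torch
    import torch.nn.functional as F

    def hardswish_ref(x: torch.Tensor) -> torch.Tensor:
        # Same closed form as the test's reference: x * clamp(x + 3, 0, 6) / 6
        return x * torch.clamp(x + 3, min=0, max=6) / 6

    x = torch.linspace(-6.0, 6.0, steps=121)
    assert torch.allclose(hardswish_ref(x), F.hardswish(x), atol=1e-6)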
test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCastDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCast(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCast( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::cast diff --git a/src/infiniop/ops/cast/cast.h b/src/infiniop/ops/cast/cast.h new file mode 100644 index 000000000..5e66997cc --- /dev/null +++ b/src/infiniop/ops/cast/cast.h @@ -0,0 +1,48 @@ +#ifndef __CAST_H__ +#define __CAST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cast::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + CastInfo _info; \ + size_t _min_workspace_size; \ + \ + Descriptor( \ + CastInfo info, \ + size_t min_workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _min_workspace_size(min_workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + 
infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc); \ + \ + size_t workspaceSize() const; \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __CAST_H__ diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc new file mode 100644 index 000000000..9a8bcc5a5 --- /dev/null +++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc @@ -0,0 +1,135 @@ +#include "cast_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../info.h" +#include "infinicore.h" +#include + +namespace op::cast::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + + *desc_ptr = new Descriptor( + info_r.take(), + 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline void cpu_cast_impl_incremental( + void *output, const void *input, const op::cast::CastInfo &info) { + + const size_t ndim = info.shape.size(); + const size_t n = info.n; + + auto out_base = reinterpret_cast(output); + auto in_base = reinterpret_cast(input); + + const std::vector &shape = info.shape; + const std::vector &in_stride = info.in_stride; + const std::vector &out_stride = info.out_stride; + + if (n == 0) return; + + std::vector idx(ndim, 0); + ptrdiff_t in_off = 0; + ptrdiff_t out_off = 0; + + for (size_t it = 0; it < n; ++it) { + const Tin *in_elem = in_base + in_off; + Tout *out_elem = out_base + out_off; + *out_elem = utils::cast(*in_elem); + + for (int d = static_cast(ndim) - 1; d >= 0; --d) { + idx[d] += 1; + if (in_stride[d] != 0) in_off += in_stride[d]; + if (out_stride[d] != 0) out_off += out_stride[d]; + + if (idx[d] < shape[d]) { + break; + } else { + idx[d] = 0; + if (in_stride[d] != 0) in_off -= static_cast(shape[d]) * in_stride[d]; + if (out_stride[d] != 0) out_off -= static_cast(shape[d]) * out_stride[d]; + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; // or INFINI_STATUS_INPLACE_NOT_SUPPORTED + } + + #define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_I64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_U32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_U64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F16: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + 
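The CPU cast path above avoids recomputing a full offset for every element: it advances a multi-dimensional index like an odometer and updates the input and output offsets incrementally, so a step touches only the dimensions that carry. A small Python sketch of the same traversal (the names and flat-list storage are illustrative, not the library's API); a zero stride, as used for broadcast dimensions, simply contributes nothing to the offset:

    def strided_copy(shape, in_stride, out_stride, src, dst):
        # Odometer-style walk: bump the innermost index, carry into outer dims,
        # and keep running input/output offsets instead of recomputing them.
        ndim = len(shape)
        n = 1
        for extent in shape:
            n *= extent
        idx = [0] * ndim
        in_off = out_off = 0
        for _ in range(n):
            dst[out_off] = src[in_off]            # the dtype conversion would happen here
            for d in range(ndim - 1, -1, -1):
                idx[d] += 1
                in_off += in_stride[d]
                out_off += out_stride[d]
                if idx[d] < shape[d]:
                    break
                idx[d] = 0                         # carry: rewind this dimension
                in_off -= shape[d] * in_stride[d]
                out_off -= shape[d] * out_stride[d]

    # A 2x3 contiguous source copied into a destination whose rows are padded to stride 4.
    src = list(range(6))
    dst = [0] * 8
    strided_copy((2, 3), (3, 1), (4, 1), src, dst)
    assert dst == [0, 1, 2, 0, 3, 4, 5, 0]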
CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CASE_OUT + + return INFINI_STATUS_SUCCESS; +} + + +} // namespace op::cast::cpu diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.h b/src/infiniop/ops/cast/cpu/cast_cpu.h new file mode 100644 index 000000000..ca929a694 --- /dev/null +++ b/src/infiniop/ops/cast/cpu/cast_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CAST_CPU_H__ +#define __CAST_CPU_H__ + +#include "../cast.h" + +DESCRIPTOR(cpu) + +#endif // __CAST_CPU_H__ diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh new file mode 100644 index 000000000..eee801b12 --- /dev/null +++ b/src/infiniop/ops/cast/cuda/kernel.cuh @@ -0,0 +1,75 @@ +#ifndef __CAST_CUDA_KERNEL_CUH__ +#define __CAST_CUDA_KERNEL_CUH__ + +#include +#include +#include + +template +__device__ __forceinline__ Tout device_cast(const Tin &v) { + if constexpr (std::is_same_v) { + float f; + if constexpr (std::is_same_v) { + f = __half2float(v); + } else { + f = static_cast(v); + } + return __float2half_rn(f); + } else if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return __half2float(v); + } else { + return static_cast(v); + } + } else if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return static_cast(__half2float(v)); + } else { + return static_cast(v); + } + } else { // integer outputs + // convert via double/float then to integer (truncate) + if constexpr (std::is_same_v) { + float f = __half2float(v); + return static_cast(f); + } else { + return static_cast(v); + } + } +} + +template +__global__ void cast_kernel( + ToutDev *__restrict__ out, + const TinDev *__restrict__ in, + size_t n, + const size_t *__restrict__ shape, + const size_t *__restrict__ div, + const long long *__restrict__ in_stride, + const long long *__restrict__ out_stride, + int ndim) { + + size_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t grid_stride = static_cast(blockDim.x) * gridDim.x; + + for (size_t linear = gid; linear < n; linear += grid_stride) { + unsigned long long rem = linear; + long long in_off = 0; + long long out_off = 0; + for (int d = 0; d < ndim; ++d) { + unsigned long long idx_d = 0; + size_t divisor = div[d]; + if (divisor != 0) { + idx_d = rem / divisor; + rem = rem % divisor; + } else { + idx_d = 0; + } + if (in_stride[d] != 0) in_off += static_cast(idx_d) * in_stride[d]; + if (out_stride[d] != 0) out_off += static_cast(idx_d) * out_stride[d]; + } + out[static_cast(out_off)] = device_cast(in[static_cast(in_off)]); + } +} + +#endif // __CAST_CUDA_KERNEL_CUH__ diff --git a/src/infiniop/ops/cast/info.h b/src/infiniop/ops/cast/info.h new file mode 100644 index 000000000..4283a8224 --- /dev/null +++ b/src/infiniop/ops/cast/info.h @@ -0,0 +1,58 @@ +#ifndef __CAST_INFO_H__ +#define __CAST_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::cast { + +class CastInfo { + CastInfo() = default; + +public: + infiniDtype_t dt_in; + infiniDtype_t dt_out; + std::vector shape; + std::vector in_stride; + std::vector out_stride; + size_t n; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + auto dt_out = out_desc->dtype(); + auto dt_in = in_desc->dtype(); + + CHECK_DTYPE(dt_in, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_F16, 
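The CUDA kernel shared by the NVIDIA, Iluvatar, and MetaX backends takes a different route from the CPU walk: each thread converts its flat element index back into per-dimension coordinates using precomputed divisors (div[d] is the product of the trailing dimensions, uploaded from the host as h_div), then applies the input and output strides. A host-side Python sketch of that index arithmetic, assuming row-major ordering of the shape:

    def offsets_from_linear(linear, shape, in_stride, out_stride):
        # div[d] = product of the dimensions after d, mirroring the h_div array the host uploads.
        ndim = len(shape)
        div = [1] * ndim
        for d in range(ndim - 2, -1, -1):
            div[d] = div[d + 1] * shape[d + 1]
        rem, in_off, out_off = linear, 0, 0
        for d in range(ndim):
            idx_d, rem = divmod(rem, div[d])
            in_off += idx_d * in_stride[d]
            out_off += idx_d * out_stride[d]
        return in_off, out_off

    # Element 5 of a (2, 3) tensor is (row 1, col 2): contiguous input, output rows padded to stride 4.
    assert offsets_from_linear(5, (2, 3), (3, 1), (4, 1)) == (5, 6)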
INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_DTYPE(dt_out, + INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(out_desc->ndim() == in_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + for (size_t i = 0; i < out_desc->ndim(); ++i) { + CHECK_OR_RETURN(out_desc->dim(i) == in_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t n = 1; + for (size_t i = 0; i < in_desc->ndim(); ++i) n *= static_cast(in_desc->dim(i)); + + return utils::Result(CastInfo{ + dt_in, + dt_out, + out_desc->shape(), + in_desc->strides(), + out_desc->strides(), + n, + }); + } +}; + +} // namespace op::cast + +#endif // __CAST_INFO_H__ diff --git a/src/infiniop/ops/cast/metax/cast_metax.h b/src/infiniop/ops/cast/metax/cast_metax.h new file mode 100644 index 000000000..5ba92911e --- /dev/null +++ b/src/infiniop/ops/cast/metax/cast_metax.h @@ -0,0 +1,8 @@ +#ifndef __CAST_METAX_API_H__ +#define __CAST_METAX_API_H__ + +#include "../cast.h" + +DESCRIPTOR(metax) + +#endif // __CAST_METAX_API_H__ diff --git a/src/infiniop/ops/cast/metax/cast_metax.maca b/src/infiniop/ops/cast/metax/cast_metax.maca new file mode 100644 index 000000000..4b2103da3 --- /dev/null +++ b/src/infiniop/ops/cast/metax/cast_metax.maca @@ -0,0 +1,201 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/metax/metax_common.h" +#include "../cast.h" +#include "cast_metax.h" +#include "../info.h" + +namespace op::cast::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapHcType { using Type = T; }; +template <> struct MapHcType { using Type = half; }; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t metax_cast_impl_incremental( + void *output_, const void *input_, + const op::cast::CastInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + hcError_t propErr; + int device_id_local = 0; + using DevTout = typename MapHcType::Type; + using DevTin = typename MapHcType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + hcError_t err = hcSuccess; + err = hcMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = 
hcMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + + err = hcMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + + device_id_local = 0; + propErr = hcGetDevice(&device_id_local); + if (propErr == hcSuccess) { + hcDeviceProp_t prop; + if (hcGetDeviceProperties(&prop, device_id_local) == hcSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + cast_kernel<<>>( + out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = hcGetLastError(); + if (err != hcSuccess) goto cleanup; + + err = hcStreamSynchronize(stream); + if (err != hcSuccess) goto cleanup; + + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; + } + + #define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CASE_OUT + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::cast::metax diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu new file mode 100644 index 
000000000..2ad20c203 --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -0,0 +1,205 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../cast.h" +#include "cast_nvidia.cuh" +#include "../info.h" +#include +#include +#include +#include + +namespace op::cast::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapCudaType { using Type = T; }; +template <> struct MapCudaType { using Type = half; }; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t cuda_cast_impl_incremental( + void *output_, const void *input_, + const op::cast::CastInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + cudaError_t propErr; + int device_id_local = 0; + using DevTout = typename MapCudaType::Type; + using DevTin = typename MapCudaType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + cudaError_t err = cudaSuccess; + err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) goto cleanup; + err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) goto cleanup; + + err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) goto cleanup; + + device_id_local = 0; + propErr = cudaGetDevice(&device_id_local); + if (propErr == cudaSuccess) { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { + bs = 
std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + cast_kernel<<>>( + out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = cudaGetLastError(); + if (err != cudaSuccess) goto cleanup; + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) goto cleanup; + + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; + } + + #define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CASE_OUT + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::cast::nvidia diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh new file mode 100644 index 000000000..032e1fb2e --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CAST_CUDA_API_H__ +#define __CAST_CUDA_API_H__ + +#include "../cast.h" + +DESCRIPTOR(nvidia) + +#endif // __CAST_CUDA_API_H__ diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc new file mode 100644 index 000000000..fc3aef4ad --- /dev/null +++ b/src/infiniop/ops/cast/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cast.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cast_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cast_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cast_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCastDescriptor( + infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cast::NAMESPACE::Descriptor::create( \ + handle, \ + 
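The launch configuration in cast_nvidia.cu (and its MACA counterpart) is deliberately conservative: the block size starts at 256, is capped at half of maxThreadsPerBlock when the device properties can be queried, and the grid covers ceil(n / block) with a floor of one block; because the kernel is grid-stride, an undersized grid would still be correct, only slower. A plain Python sketch of that arithmetic with illustrative values:

    def launch_config(n, max_threads_per_block=None):
        bs = 256
        if max_threads_per_block is not None:
            # Cap at half the device limit, mirroring prop.maxThreadsPerBlock / 2.
            bs = min(bs, max_threads_per_block // 2)
        if bs <= 0:
            bs = 256
        grid = (n + bs - 1) // bs   # ceiling division
        return max(grid, 1), bs

    assert launch_config(1_000_000, max_threads_per_block=1024) == (3907, 256)
    assert launch_config(10, max_threads_per_block=256) == (1, 128)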
reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) \ + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCast( + infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/cast.py b/test/infiniop/cast.py new file mode 100644 index 000000000..87b572741 --- /dev/null +++ b/test/infiniop/cast.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map +) +import itertools + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +_TEST_CASES = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 
5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +_TOLERANCE_MAP = { + ("float", "float"): {"atol": 1e-3, "rtol": 1e-3}, + ("int", "float"): {"atol": 1.0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + + +def _is_float_dtype(inf_dt): + return inf_dt in _FLOAT_DTYPES + + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def reference_cast_torch(output_tensor, input_tensor): + converted = input_tensor.to(dtype=output_tensor.dtype, device=output_tensor.device).clone() + output_tensor.copy_(converted) + + +def make_integer_torch_tensor(shape, inf_dt, device): + use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + + +def test( + handle, + device, + shape, + in_stride, + out_stride, + dtype_pair, + sync=None, +): + in_dt, out_dt = dtype_pair + + if not is_supported_dt(in_dt) or not is_supported_dt(out_dt): + print(f"Skipping test for in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} because dtype not supported on this platform") + return + + try: + if _is_integer_dtype(in_dt): + in_torch = make_integer_torch_tensor(shape, in_dt, device) + input = TestTensor.from_torch(in_torch, in_dt, device) + else: + input = TestTensor(shape, in_stride, in_dt, device, mode="random") + + output = TestTensor(shape, out_stride, out_dt, device, mode="zeros") + + if output.is_broadcast(): + return + + print(f"Testing Cast on {InfiniDeviceNames[device]} shape={shape} in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} in_stride={in_stride} out_stride={out_stride}") + + reference_cast_torch(output.actual_tensor(), input.torch_tensor()) + + expected = output.actual_tensor().clone() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCastDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error(LIBINFINIOP.infiniopGetCastWorkspaceSize(descriptor, ctypes.byref(workspace_size))) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_cast(): + check_error( + LIBINFINIOP.infiniopCast( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_cast() + + actual = output.actual_tensor() + + if _is_integer_dtype(in_dt) and _is_float_dtype(out_dt): + tol = _TOLERANCE_MAP[("int", "float")] + atol, rtol = tol["atol"], 
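The integer sampling ranges above (about ±2000 for the 32-bit types, ±2048 for the 64-bit ones, and non-negative for the unsigned variants) look deliberately small: every value drawn from them is exactly representable in float16, so integer-to-float casts can still be compared with a tight tolerance. That cutoff sits at 2048; this is an inferred rationale rather than something the patch states, and the throwaway check below only illustrates the float16 property:

    import torch

    # float16 carries 11 bits of significand, so every integer with |x| <= 2048 round-trips exactly.
    x = torch.arange(-2048, 2049, dtype=torch.int64)
    assert torch.equal(x.to(torch.float16).to(torch.int64), x)

    # 2049 is the first positive integer float16 cannot hold; it rounds to 2048.
    assert torch.tensor(2049, dtype=torch.int64).to(torch.float16).item() == 2048.0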
tol["rtol"] + elif _is_float_dtype(in_dt) and _is_float_dtype(out_dt): + tol = _TOLERANCE_MAP[("float", "float")] + atol, rtol = tol["atol"], tol["rtol"] + else: + atol, rtol = 0, 0 + + if DEBUG: + debug(actual, expected, atol=atol, rtol=rtol) + + assert torch.allclose(actual, expected, atol=atol, rtol=rtol), \ + f"Mismatch for in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} shape={shape}" + + if PROFILE: + profile_operation("PyTorch", lambda: reference_cast_torch(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cast(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyCastDescriptor(descriptor)) + + except RuntimeError as e: + if "not implemented for 'UInt32'" in str(e) or "not implemented for 'UInt64'" in str(e): + #print(f"Skipping unsupported operation: {e}") + return False + else: + raise + + +def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + integer_pairs = itertools.product(_INTEGER_DTYPES, _INTEGER_DTYPES) + float_pairs = itertools.product(_FLOAT_DTYPES, _FLOAT_DTYPES) + int_to_float_pairs = itertools.product(_INTEGER_DTYPES, _FLOAT_DTYPES) + + all_pairs = list(set(itertools.chain(integer_pairs, float_pairs, int_to_float_pairs))) + + supported_pairs = [] + skipped_pairs = [] + for pair in all_pairs: + in_dt, out_dt = pair + if is_supported_dt(in_dt) and is_supported_dt(out_dt): + supported_pairs.append(pair) + else: + skipped_pairs.append(pair) + + print(f"Supported dtype pairs: {[(InfiniDtypeNames[in_d], InfiniDtypeNames[out_d]) for in_d, out_d in supported_pairs]}") + if skipped_pairs: + print(f"Warning: skipping unsupported dtype pairs: {[(InfiniDtypeNames[in_d], InfiniDtypeNames[out_d]) for in_d, out_d in skipped_pairs]}") + + devices = get_test_devices(args) + + for device in devices: + test_operator(device, test, _TEST_CASES, supported_pairs) + + print("\033[92mAll cast tests passed!\033[0m") + + +if __name__ == "__main__": + main() From 53bfa538eae8054a151aebe117f18e17f105f1c5 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 17 Aug 2025 23:14:42 +0800 Subject: [PATCH 10/16] [T1-1-1]: Where operator with cpu nvidia metax iluvatar and test --- include/infiniop/ops/where.h | 28 ++ src/infiniop-test/src/ops/cast.cpp | 24 +- src/infiniop-test/src/ops/cos.cpp | 14 +- src/infiniop-test/src/ops/exp.cpp | 15 +- src/infiniop-test/src/ops/hardswish.cpp | 24 +- src/infiniop-test/src/ops/leakyrelu.cpp | 28 +- .../src/ops/sigmoid_backward.cpp | 28 +- src/infiniop-test/src/ops/sin.cpp | 14 +- src/infiniop-test/src/ops/tanh.cpp | 24 +- src/infiniop-test/src/ops/where.cpp | 151 +++++++++ src/infiniop/ops/cast/cpu/cast_cpu.cc | 83 ++--- src/infiniop/ops/cast/cuda/kernel.cuh | 8 +- src/infiniop/ops/cast/info.h | 20 +- src/infiniop/ops/cast/nvidia/cast_nvidia.cu | 144 +++++---- src/infiniop/ops/cast/operator.cc | 16 +- src/infiniop/ops/cos/cpu/cos_cpu.h | 2 +- src/infiniop/ops/cos/cuda/kernel.cuh | 69 +++-- src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 2 +- src/infiniop/ops/cos/operator.cc | 20 +- src/infiniop/ops/exp/cpu/exp_cpu.h | 2 +- src/infiniop/ops/exp/cuda/kernel.cuh | 54 ++-- src/infiniop/ops/exp/nvidia/exp_nvidia.cu | 2 +- src/infiniop/ops/exp/operator.cc | 20 +- .../ops/hardswish/cpu/hardswish_cpu.h | 2 +- src/infiniop/ops/hardswish/cuda/kernel.cuh | 79 +++-- 
.../ops/hardswish/nvidia/hardswish_nvidia.cu | 2 +- src/infiniop/ops/hardswish/operator.cc | 28 +- .../ops/leakyrelu/cpu/leakyrelu_cpu.cc | 40 ++- src/infiniop/ops/leakyrelu/cuda/kernel.cuh | 10 +- src/infiniop/ops/leakyrelu/info.h | 7 +- .../ops/leakyrelu/nvidia/leakyrelu_nvidia.cu | 105 ++++--- src/infiniop/ops/leakyrelu/operator.cc | 28 +- .../cpu/sigmoid_backward_cpu.h | 5 +- .../ops/sigmoid_backward/cuda/kernel.cuh | 6 +- src/infiniop/ops/sigmoid_backward/operator.cc | 22 +- src/infiniop/ops/sin/cpu/sin_cpu.h | 2 +- src/infiniop/ops/sin/cuda/kernel.cuh | 52 ++-- src/infiniop/ops/sin/nvidia/sin_nvidia.cu | 2 +- src/infiniop/ops/sin/operator.cc | 20 +- src/infiniop/ops/tanh/cpu/tanh_cpu.h | 2 +- src/infiniop/ops/tanh/cuda/kernel.cuh | 66 ++-- src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu | 2 +- src/infiniop/ops/tanh/operator.cc | 20 +- src/infiniop/ops/where/cpu/where_cpu.cc | 84 +++++ src/infiniop/ops/where/cpu/where_cpu.h | 19 ++ src/infiniop/ops/where/cuda/kernel.cuh | 15 + src/infiniop/ops/where/metax/where_metax.h | 8 + src/infiniop/ops/where/metax/where_metax.maca | 62 ++++ src/infiniop/ops/where/nvidia/where_nvidia.cu | 91 ++++++ .../ops/where/nvidia/where_nvidia.cuh | 8 + src/infiniop/ops/where/operator.cc | 148 +++++++++ test/infiniop/where.py | 288 ++++++++++++++++++ 52 files changed, 1507 insertions(+), 508 deletions(-) create mode 100644 include/infiniop/ops/where.h create mode 100644 src/infiniop-test/src/ops/where.cpp create mode 100644 src/infiniop/ops/where/cpu/where_cpu.cc create mode 100644 src/infiniop/ops/where/cpu/where_cpu.h create mode 100644 src/infiniop/ops/where/cuda/kernel.cuh create mode 100644 src/infiniop/ops/where/metax/where_metax.h create mode 100644 src/infiniop/ops/where/metax/where_metax.maca create mode 100644 src/infiniop/ops/where/nvidia/where_nvidia.cu create mode 100644 src/infiniop/ops/where/nvidia/where_nvidia.cuh create mode 100644 src/infiniop/ops/where/operator.cc create mode 100644 test/infiniop/where.py diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..a328c312a --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + infiniopTensorDescriptor_t condition); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); + +__C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/src/ops/cast.cpp b/src/infiniop-test/src/ops/cast.cpp index d91f5eb6c..6547bc25a 100644 --- a/src/infiniop-test/src/ops/cast.cpp +++ b/src/infiniop-test/src/ops/cast.cpp @@ -58,8 +58,8 @@ std::shared_ptr Test::run( auto output = _attributes->output->to(device, device_id); CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc, - output->desc(), - input->desc()), + output->desc(), + input->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t 
workspace_size; CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size), @@ -68,9 +68,9 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopCast(op_desc, workspace, workspace_size, - output->data(), - input->data(), - nullptr), + output->data(), + input->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -94,16 +94,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -119,4 +119,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::cast +} // namespace infiniop_test::cast diff --git a/src/infiniop-test/src/ops/cos.cpp b/src/infiniop-test/src/ops/cos.cpp index 7cae4574d..52de283af 100644 --- a/src/infiniop-test/src/ops/cos.cpp +++ b/src/infiniop-test/src/ops/cos.cpp @@ -86,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -111,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::cos +} // namespace infiniop_test::cos diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp index 395408e15..070f8ef6b 100644 --- a/src/infiniop-test/src/ops/exp.cpp +++ b/src/infiniop-test/src/ops/exp.cpp @@ -41,7 +41,6 @@ std::shared_ptr Test::build( test->_atol = 1e-6; } - return test; } @@ -87,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -112,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::exp +} // namespace infiniop_test::exp diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp index 25b161ccf..0ccf4f52a 100644 --- a/src/infiniop-test/src/ops/hardswish.cpp +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -50,8 +50,8 @@ std::shared_ptr Test::run( auto input = _attributes->input->to(device, device_id); auto output = _attributes->output->to(device, device_id); CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc, - output->desc(), - input->desc()), + output->desc(), + input->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t workspace_size; 
CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size), @@ -60,9 +60,9 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size, - output->data(), - input->data(), - nullptr), + output->data(), + input->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -86,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -111,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::hardswish +} // namespace infiniop_test::hardswish diff --git a/src/infiniop-test/src/ops/leakyrelu.cpp b/src/infiniop-test/src/ops/leakyrelu.cpp index c63741120..b7d9eb89c 100644 --- a/src/infiniop-test/src/ops/leakyrelu.cpp +++ b/src/infiniop-test/src/ops/leakyrelu.cpp @@ -54,11 +54,11 @@ std::shared_ptr Test::run( auto input = _attributes->input->to(device, device_id); auto output = _attributes->output->to(device, device_id); CHECK_OR(infiniopCreateLeakyreluDescriptor(handle, &op_desc, - output->desc(), - input->desc(), - _attributes->negative_slope), + output->desc(), + input->desc(), + _attributes->negative_slope), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); - + size_t workspace_size; CHECK_OR(infiniopGetLeakyreluWorkspaceSize(op_desc, &workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); @@ -68,9 +68,9 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); } CHECK_OR(infiniopLeakyrelu(op_desc, workspace, workspace_size, - output->data(), - input->data(), - nullptr), + output->data(), + input->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -94,16 +94,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {"negative_slope"}; +std::vector Test::attribute_names() { + return {"negative_slope"}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -120,4 +120,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::leakyrelu +} // namespace infiniop_test::leakyrelu diff --git a/src/infiniop-test/src/ops/sigmoid_backward.cpp b/src/infiniop-test/src/ops/sigmoid_backward.cpp index 116055300..434dbf598 100644 --- a/src/infiniop-test/src/ops/sigmoid_backward.cpp +++ b/src/infiniop-test/src/ops/sigmoid_backward.cpp @@ -54,9 +54,9 @@ std::shared_ptr Test::run( auto grad_output = _attributes->grad_output->to(device, device_id); auto grad_input = _attributes->grad_input->to(device, device_id); CHECK_OR(infiniopCreateSigmoidBackwardDescriptor(handle, &op_desc, - grad_input->desc(), - input->desc(), - 
grad_output->desc()), + grad_input->desc(), + input->desc(), + grad_output->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t workspace_size; CHECK_OR(infiniopGetSigmoidBackwardWorkspaceSize(op_desc, &workspace_size), @@ -65,10 +65,10 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopSigmoidBackward(op_desc, workspace, workspace_size, - grad_input->data(), - input->data(), - grad_output->data(), - nullptr), + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -93,16 +93,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "grad_output", "grad_input", "ans"}; +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; } -std::vector Test::output_names() { - return {"grad_input"}; +std::vector Test::output_names() { + return {"grad_input"}; } std::string Test::toString() const { @@ -119,4 +119,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::sigmoid_backward +} // namespace infiniop_test::sigmoid_backward diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp index db256c283..e1406e588 100644 --- a/src/infiniop-test/src/ops/sin.cpp +++ b/src/infiniop-test/src/ops/sin.cpp @@ -86,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } -std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -111,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::sin +} // namespace infiniop_test::sin diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp index bb8c6b081..6aeb3c301 100644 --- a/src/infiniop-test/src/ops/tanh.cpp +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -50,8 +50,8 @@ std::shared_ptr Test::run( auto input = _attributes->input->to(device, device_id); auto output = _attributes->output->to(device, device_id); CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, - output->desc(), - input->desc()), + output->desc(), + input->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t workspace_size; CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), @@ -60,9 +60,9 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, - output->data(), - input->data(), - nullptr), + output->data(), + input->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { @@ -86,16 +86,16 @@ std::shared_ptr Test::run( return TEST_PASSED(elapsed_time); } -std::vector Test::attribute_names() { - return {}; +std::vector Test::attribute_names() { + return {}; } 
-std::vector Test::tensor_names() { - return {"input", "output", "ans"}; +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; } -std::vector Test::output_names() { - return {"output"}; +std::vector Test::output_names() { + return {"output"}; } std::string Test::toString() const { @@ -111,4 +111,4 @@ std::string Test::toString() const { Test::~Test() { delete _attributes; } -} // namespace infiniop_test::tanh +} // namespace infiniop_test::tanh diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp new file mode 100644 index 000000000..fea9cba92 --- /dev/null +++ b/src/infiniop-test/src/ops/where.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::where { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr condition; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("condition") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->condition = tensors["condition"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->a->ggml_type(); + if (elemType == GGML_TYPE_I8) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopWhereDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto condition = _attributes->condition->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateWhereDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc(), + condition->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetWhereWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopWhere(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const 
std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopWhere( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "condition", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- condition: " << _attributes->condition->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::where diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc index 9a8bcc5a5..36d2e9e28 100644 --- a/src/infiniop/ops/cast/cpu/cast_cpu.cc +++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc @@ -46,7 +46,9 @@ static inline void cpu_cast_impl_incremental( const std::vector &in_stride = info.in_stride; const std::vector &out_stride = info.out_stride; - if (n == 0) return; + if (n == 0) { + return; + } std::vector idx(ndim, 0); ptrdiff_t in_off = 0; @@ -59,15 +61,23 @@ static inline void cpu_cast_impl_incremental( for (int d = static_cast(ndim) - 1; d >= 0; --d) { idx[d] += 1; - if (in_stride[d] != 0) in_off += in_stride[d]; - if (out_stride[d] != 0) out_off += out_stride[d]; + if (in_stride[d] != 0) { + in_off += in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += out_stride[d]; + } if (idx[d] < shape[d]) { break; } else { idx[d] = 0; - if (in_stride[d] != 0) in_off -= static_cast(shape[d]) * in_stride[d]; - if (out_stride[d] != 0) out_off -= static_cast(shape[d]) * out_stride[d]; + if (in_stride[d] != 0) { + in_off -= static_cast(shape[d]) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off -= static_cast(shape[d]) * out_stride[d]; + } } } } @@ -80,39 +90,39 @@ infiniStatus_t Descriptor::calculate( const void *input, void *stream) const { - if (output == const_cast(input)) { + if (output == const_cast(input)) { return INFINI_STATUS_BAD_PARAM; // or INFINI_STATUS_INPLACE_NOT_SUPPORTED } - #define CASE_OUT(DT_OUT, TOUT) \ - case DT_OUT: { \ - switch (_info.dt_in) { \ - case INFINI_DTYPE_I32: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_I64: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_U32: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_U64: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_F16: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_F32: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - case INFINI_DTYPE_F64: \ - cpu_cast_impl_incremental(output, input, _info); \ - break; \ - default: \ - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ - } \ - break; \ - } +#define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_I64: \ + 
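The where test drives the operator with four tensors (condition, a, b, and the output c) and checks c against a precomputed ans. Assuming the usual element-wise select semantics, where c takes the value of a wherever condition is true and of b elsewhere, torch.where serves as a one-line reference:

    import torch

    cond = torch.tensor([[True, False], [False, True]])
    a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
    b = torch.tensor([[10.0, 20.0], [30.0, 40.0]])

    # Element-wise select: keep a where cond holds, fall back to b elsewhere.
    c = torch.where(cond, a, b)
    assert torch.equal(c, torch.tensor([[1.0, 20.0], [30.0, 4.0]]))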
cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_U32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_U64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F16: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F32: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + case INFINI_DTYPE_F64: \ + cpu_cast_impl_incremental(output, input, _info); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } switch (_info.dt_out) { CASE_OUT(INFINI_DTYPE_I32, int32_t); @@ -126,10 +136,9 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CASE_OUT +#undef CASE_OUT return INFINI_STATUS_SUCCESS; } - } // namespace op::cast::cpu diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh index eee801b12..3736442a3 100644 --- a/src/infiniop/ops/cast/cuda/kernel.cuh +++ b/src/infiniop/ops/cast/cuda/kernel.cuh @@ -65,8 +65,12 @@ __global__ void cast_kernel( } else { idx_d = 0; } - if (in_stride[d] != 0) in_off += static_cast(idx_d) * in_stride[d]; - if (out_stride[d] != 0) out_off += static_cast(idx_d) * out_stride[d]; + if (in_stride[d] != 0) { + in_off += static_cast(idx_d) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += static_cast(idx_d) * out_stride[d]; + } } out[static_cast(out_off)] = device_cast(in[static_cast(in_off)]); } diff --git a/src/infiniop/ops/cast/info.h b/src/infiniop/ops/cast/info.h index 4283a8224..8f85f6da8 100644 --- a/src/infiniop/ops/cast/info.h +++ b/src/infiniop/ops/cast/info.h @@ -10,7 +10,7 @@ namespace op::cast { class CastInfo { CastInfo() = default; -public: +public: infiniDtype_t dt_in; infiniDtype_t dt_out; std::vector shape; @@ -21,9 +21,9 @@ class CastInfo { static utils::Result create( infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t in_desc) { - + auto dt_out = out_desc->dtype(); - auto dt_in = in_desc->dtype(); + auto dt_in = in_desc->dtype(); CHECK_DTYPE(dt_in, INFINI_DTYPE_I32, INFINI_DTYPE_I64, @@ -40,14 +40,16 @@ class CastInfo { } size_t n = 1; - for (size_t i = 0; i < in_desc->ndim(); ++i) n *= static_cast(in_desc->dim(i)); + for (size_t i = 0; i < in_desc->ndim(); ++i) { + n *= static_cast(in_desc->dim(i)); + } return utils::Result(CastInfo{ - dt_in, - dt_out, - out_desc->shape(), - in_desc->strides(), - out_desc->strides(), + dt_in, + dt_out, + out_desc->shape(), + in_desc->strides(), + out_desc->strides(), n, }); } diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu index 2ad20c203..8e7eea473 100644 --- a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -1,12 +1,12 @@ -#include "../cuda/kernel.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh" #include "../cast.h" -#include "cast_nvidia.cuh" +#include "../cuda/kernel.cuh" #include "../info.h" -#include +#include "cast_nvidia.cuh" #include -#include #include +#include +#include namespace op::cast::nvidia { @@ -18,8 +18,14 @@ Descriptor::~Descriptor() { delete _opaque; } -template struct MapCudaType { using Type = T; }; -template <> struct MapCudaType { using Type = half; }; +template +struct MapCudaType { + using Type = T; +}; +template <> +struct MapCudaType { + using Type = half; +}; infiniStatus_t Descriptor::create( infiniopHandle_t handle_, @@ -49,18 +55,18 @@ size_t Descriptor::workspaceSize() const 
{ template static inline infiniStatus_t cuda_cast_impl_incremental( - void *output_, const void *input_, - const op::cast::CastInfo &info, + void *output_, const void *input_, + const op::cast::CastInfo &info, void *stream_) { int bs = 256, grid = 0; cudaError_t propErr; int device_id_local = 0; using DevTout = typename MapCudaType::Type; - using DevTin = typename MapCudaType::Type; + using DevTin = typename MapCudaType::Type; auto out_dev = reinterpret_cast(output_); - auto in_dev = reinterpret_cast(input_); + auto in_dev = reinterpret_cast(input_); auto stream = reinterpret_cast(stream_); int ndim = static_cast(info.shape.size()); @@ -88,22 +94,38 @@ static inline infiniStatus_t cuda_cast_impl_incremental( cudaError_t err = cudaSuccess; err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } device_id_local = 0; propErr = cudaGetDevice(&device_id_local); @@ -112,24 +134,36 @@ static inline infiniStatus_t cuda_cast_impl_incremental( if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); } else { - if (bs > 256) bs = 256; + if (bs > 256) { + bs = 256; + } } } else { - if (bs > 256) bs = 256; + if (bs > 256) { + bs = 256; + } } - if (bs <= 0) bs = 256; + if (bs <= 0) { + bs = 256; + } grid = static_cast((info.n + bs - 1) / bs); - if (grid <= 0) grid = 1; + if (grid <= 0) { + grid = 1; + } cast_kernel<<>>( out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); err = cudaGetLastError(); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } cudaFree(d_shape); cudaFree(d_div); @@ -152,39 +186,39 @@ infiniStatus_t Descriptor::calculate( const void *input, void *stream) const { - if (output == const_cast(input)) { + if (output == const_cast(input)) { return INFINI_STATUS_BAD_PARAM; } - #define CASE_OUT(DT_OUT, TOUT) \ - case DT_OUT: { \ - switch (_info.dt_in) { \ - case INFINI_DTYPE_I32: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_I64: \ - 
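Note: a condensed view of the launch-geometry arithmetic used by cuda_cast_impl_incremental above (and repeated later for leakyrelu): clamp the block size against the device limit, then cover all n elements with a ceiling division. The helper below is illustrative and folds the device-query fallbacks into one parameter.

    #include <cstddef>

    // block: threads per block, grid: number of blocks, chosen so that
    // block * grid >= n while respecting the device's per-block limit.
    inline void launch_geometry(size_t n, int max_threads_per_block,
                                int &block, int &grid) {
        block = 256;
        if (max_threads_per_block > 0 && max_threads_per_block / 2 < block) {
            block = max_threads_per_block / 2;
        }
        if (block <= 0) {
            block = 256;
        }
        grid = static_cast<int>((n + static_cast<size_t>(block) - 1)
                                / static_cast<size_t>(block));
        if (grid <= 0) {
            grid = 1;
        }
    }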
cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_U32: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_U64: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_F16: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_F32: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - case INFINI_DTYPE_F64: \ - cuda_cast_impl_incremental(output, input, _info, stream); \ - break; \ - default: \ - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ - } \ - break; \ - } +#define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } switch (_info.dt_out) { CASE_OUT(INFINI_DTYPE_I32, int32_t); @@ -198,7 +232,7 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - #undef CASE_OUT +#undef CASE_OUT return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc index fc3aef4ad..12d26953b 100644 --- a/src/infiniop/ops/cast/operator.cc +++ b/src/infiniop/ops/cast/operator.cc @@ -24,7 +24,7 @@ __C infiniStatus_t infiniopCreateCastDescriptor( handle, \ reinterpret_cast(desc_ptr), \ output_desc, \ - input_desc) \ + input_desc) switch (handle->device) { @@ -50,8 +50,8 @@ __C infiniStatus_t infiniopCreateCastDescriptor( __C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -84,8 +84,8 @@ __C infiniStatus_t infiniopCast( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, output, input, stream) @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopCast( __C infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h index 37efb7597..af324eb80 100644 --- a/src/infiniop/ops/cos/cpu/cos_cpu.h +++ b/src/infiniop/ops/cos/cpu/cos_cpu.h @@ -1,8 +1,8 @@ #ifndef __COS_CPU_H__ #define __COS_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(cos, cpu) diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh 
b/src/infiniop/ops/cos/cuda/kernel.cuh index 381a897f0..5db7ee8f6 100644 --- a/src/infiniop/ops/cos/cuda/kernel.cuh +++ b/src/infiniop/ops/cos/cuda/kernel.cuh @@ -1,48 +1,47 @@ #ifndef __COS_CUDA_H__ #define __COS_CUDA_H__ -#include -#include #include +#include +#include namespace op::cos::cuda { typedef struct CosOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &input) const { - auto cos_f32 = [] __device__ (float x) { - double xd = static_cast(x); - double yd = std::cos(xd); - return static_cast(yd); - }; + template + __device__ __forceinline__ T operator()(const T &input) const { + auto cos_f32 = [] __device__(float x) { + double xd = static_cast(x); + double yd = std::cos(xd); + return static_cast(yd); + }; - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2( - cos_f32(vf.x), - cos_f32(vf.y) - ); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = cos_f32(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rz(cos_f32(f0), cos_f32(f1)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - return __float2bfloat16_rz(cos_f32(xf)); - } else if constexpr (std::is_same_v) { - return cos_f32(input); - } else if constexpr (std::is_same_v) { - return std::cos(input); - } else { - return std::cos(input); + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + cos_f32(vf.x), + cos_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = cos_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rz(cos_f32(f0), cos_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(cos_f32(xf)); + } else if constexpr (std::is_same_v) { + return cos_f32(input); + } else if constexpr (std::is_same_v) { + return std::cos(input); + } else { + return std::cos(input); + } } - } } CosOp; } // namespace op::cos::cuda diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu index a3c38bc89..433363c91 100644 --- a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc index 11781d591..71a5f807c 100644 --- a/src/infiniop/ops/cos/operator.cc +++ b/src/infiniop/ops/cos/operator.cc @@ -18,13 +18,13 @@ __C infiniStatus_t infiniopCreateCosDescriptor( infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) \ 
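Note: every branch of the CosOp functor above widens to a common precision before calling the math function and then rounds back: half and bfloat16 lanes are converted to float, and the float path evaluates in double through the cos_f32 lambda. A host-side sketch of that float path, with an illustrative name:

    #include <cmath>

    // Evaluate cos for a float input in double precision, then narrow,
    // mirroring what the functor's cos_f32 lambda does on the device.
    inline float cos_f32_ref(float x) {
        return static_cast<float>(std::cos(static_cast<double>(x)));
    }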
+#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) switch (handle->device) { @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopCos( __C infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h index fbf9ab126..867c7afa5 100644 --- a/src/infiniop/ops/exp/cpu/exp_cpu.h +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -1,8 +1,8 @@ #ifndef __EXP_CPU_H__ #define __EXP_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(exp, cpu) diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh index 316a393be..12446f31a 100644 --- a/src/infiniop/ops/exp/cuda/kernel.cuh +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -1,39 +1,39 @@ #ifndef __EXP_CUDA_H__ #define __EXP_CUDA_H__ -#include -#include #include +#include +#include namespace op::exp::cuda { typedef struct ExpOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float inputf = __half2float(input); - return __float2half_rn(__expf(inputf)); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(__expf(f0), __expf(f1)); - } else if constexpr (std::is_same_v) { - float inputf = __bfloat162float(input); - return __float2bfloat16_rn(__expf(inputf)); - } else if constexpr (std::is_same_v) { - return __expf(input); - } else if constexpr (std::is_same_v) { - return std::exp(input); - } else { - return std::exp(input); + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__expf(f0), __expf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + return __expf(input); + } else if constexpr (std::is_same_v) { + return std::exp(input); + } else { + return std::exp(input); + } } - } } ExpOp; -} // namespace +} // namespace op::exp::cuda #endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu index f4229a942..3bdf2eb45 100644 --- a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = 
out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc index 56f5d29cd..ee1dc6768 100644 --- a/src/infiniop/ops/exp/operator.cc +++ b/src/infiniop/ops/exp/operator.cc @@ -18,13 +18,13 @@ __C infiniStatus_t infiniopCreateExpDescriptor( infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::exp::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) switch (handle->device) { @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopExp( __C infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h index a42009017..e137be8a0 100644 --- a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -1,8 +1,8 @@ #ifndef __HARDSWISH_CPU_H__ #define __HARDSWISH_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(hardswish, cpu) diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh index be22e5faa..d5b369bce 100644 --- a/src/infiniop/ops/hardswish/cuda/kernel.cuh +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -1,54 +1,53 @@ #ifndef __HARDSWISH_CUDA_H__ #define __HARDSWISH_CUDA_H__ -#include -#include #include +#include +#include namespace op::hardswish::cuda { typedef struct HardswishOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 - __device__ __forceinline__ float hswish_f32(float x) const { - float y = x + 3.0f; - y = y < 0.0f ? 0.0f : (y > 6.0f ? 6.0f : y); - return x * (y * (1.0f / 6.0f)); - } + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + } - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2( - hswish_f32(vf.x), - hswish_f32(vf.y) - ); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = hswish_f32(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - return __float2bfloat16_rz(hswish_f32(xf)); - } else if constexpr (std::is_same_v) { - return hswish_f32(input); - } else if constexpr (std::is_same_v) { - double xd = static_cast(input); - double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); - return static_cast(yd); - } else { - double xd = static_cast(input); - double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); - return static_cast(yd); + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } } - } } HardswishOp; } // namespace op::hardswish::cuda diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu index 0aff55cd2..9e279c2ef 100644 --- a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc index 7787c799b..e8ba19fc1 100644 --- a/src/infiniop/ops/hardswish/operator.cc +++ b/src/infiniop/ops/hardswish/operator.cc @@ -18,13 +18,13 @@ __C infiniStatus_t infiniopCreateHardswishDescriptor( infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::hardswish::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return 
op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) switch (handle->device) { @@ -50,8 +50,8 @@ __C infiniStatus_t infiniopCreateHardswishDescriptor( __C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -84,8 +84,8 @@ __C infiniStatus_t infiniopHardswish( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, output, {input}, stream) @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopHardswish( __C infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc index cd56f0ca6..c10a44cb5 100644 --- a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc @@ -40,7 +40,9 @@ static inline void cpu_leakyrelu_impl_incremental( const size_t ndim = info.shape.size(); const size_t n = info.n; - if (n == 0) return; + if (n == 0) { + return; + } auto out_base = reinterpret_cast(output); auto in_base = reinterpret_cast(input); @@ -62,15 +64,23 @@ static inline void cpu_leakyrelu_impl_incremental( *out_elem = utils::cast(outv); for (int d = static_cast(ndim) - 1; d >= 0; --d) { idx[d] += 1; - if (in_stride[d] != 0) in_off += in_stride[d]; - if (out_stride[d] != 0) out_off += out_stride[d]; + if (in_stride[d] != 0) { + in_off += in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += out_stride[d]; + } if (idx[d] < shape[d]) { break; } else { idx[d] = 0; - if (in_stride[d] != 0) in_off -= static_cast(shape[d]) * in_stride[d]; - if (out_stride[d] != 0) out_off -= static_cast(shape[d]) * out_stride[d]; + if (in_stride[d] != 0) { + in_off -= static_cast(shape[d]) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off -= static_cast(shape[d]) * out_stride[d]; + } } } } @@ -83,22 +93,22 @@ infiniStatus_t Descriptor::calculate( const void *input, void *stream) const { - switch (_info.dt_in) { + switch (_info.dt_in) { case INFINI_DTYPE_F16: cpu_leakyrelu_impl_incremental(output, input, _info); - break; + break; case INFINI_DTYPE_BF16: cpu_leakyrelu_impl_incremental(output, input, _info); - break; - case INFINI_DTYPE_F32: + break; + case INFINI_DTYPE_F32: cpu_leakyrelu_impl_incremental(output, input, _info); - break; - case INFINI_DTYPE_F64: + break; + case INFINI_DTYPE_F64: cpu_leakyrelu_impl_incremental(output, input, _info); - break; - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } return INFINI_STATUS_SUCCESS; } } // namespace op::leakyrelu::cpu diff --git a/src/infiniop/ops/leakyrelu/cuda/kernel.cuh b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh index abad71b6a..afca17002 100644 --- a/src/infiniop/ops/leakyrelu/cuda/kernel.cuh +++ b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh @@ -1,8 +1,8 @@ #ifndef __LEAKYRELU_CUDA_KERNEL_CUH__ #define __LEAKYRELU_CUDA_KERNEL_CUH__ -#include 
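Note: the leakyrelu descriptors in this patch carry a per-descriptor negative_slope (see the info.h and operator.cc hunks below). For orientation, the conventional LeakyReLU that such a parameter implies is sketched here; the exact device formula is not visible in this extract, so treat this as the textbook definition rather than a quote from the kernels.

    // Conventional LeakyReLU: identity for non-negative inputs, scaled by
    // negative_slope otherwise.
    inline float leaky_relu_ref(float x, float negative_slope) {
        return x >= 0.0f ? x : negative_slope * x;
    }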
#include +#include #include #include @@ -56,8 +56,12 @@ __global__ void leakyrelu_kernel( } else { idx_d = 0; } - if (in_stride[d] != 0) in_off += static_cast(idx_d) * in_stride[d]; - if (out_stride[d] != 0) out_off += static_cast(idx_d) * out_stride[d]; + if (in_stride[d] != 0) { + in_off += static_cast(idx_d) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += static_cast(idx_d) * out_stride[d]; + } } float v = to_float_for_leaky(in[static_cast(in_off)]); diff --git a/src/infiniop/ops/leakyrelu/info.h b/src/infiniop/ops/leakyrelu/info.h index dd0a2d3ad..1f074d85a 100644 --- a/src/infiniop/ops/leakyrelu/info.h +++ b/src/infiniop/ops/leakyrelu/info.h @@ -34,7 +34,9 @@ class LeakyReLUInfo { } size_t n = 1; - for (size_t i = 0; i < in_desc->ndim(); ++i) n *= static_cast(in_desc->dim(i)); + for (size_t i = 0; i < in_desc->ndim(); ++i) { + n *= static_cast(in_desc->dim(i)); + } return utils::Result(LeakyReLUInfo{ dt_in, @@ -42,8 +44,7 @@ class LeakyReLUInfo { in_desc->strides(), out_desc->strides(), n, - negative_slope - }); + negative_slope}); } }; diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu index 05d149d5e..9b65bc421 100644 --- a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu @@ -1,13 +1,13 @@ -#include "../cuda/kernel.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" #include "../leakyrelu.h" #include "leakyrelu_nvidia.cuh" -#include "../info.h" -#include #include -#include #include +#include +#include namespace op::leakyrelu::nvidia { @@ -19,10 +19,19 @@ Descriptor::~Descriptor() { delete _opaque; } -template struct MapCudaType { using Type = T; }; -template <> struct MapCudaType { using Type = half; }; +template +struct MapCudaType { + using Type = T; +}; +template <> +struct MapCudaType { + using Type = half; +}; #if defined(__CUDA_BF16_TYPES_EXIST__) || defined(__CUDA_ARCH__) -template <> struct MapCudaType { using Type = __nv_bfloat16; }; +template <> +struct MapCudaType { + using Type = __nv_bfloat16; +}; #endif infiniStatus_t Descriptor::create( @@ -54,8 +63,8 @@ size_t Descriptor::workspaceSize() const { template static inline infiniStatus_t cuda_leakyrelu_impl_incremental( - void *output_, const void *input_, - const op::leakyrelu::LeakyReLUInfo &info, + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, void *stream_) { int bs = 256, grid = 0; @@ -64,7 +73,7 @@ static inline infiniStatus_t cuda_leakyrelu_impl_incremental( using DevT = typename MapCudaType::Type; auto out_dev = reinterpret_cast(output_); - auto in_dev = reinterpret_cast(input_); + auto in_dev = reinterpret_cast(input_); auto stream = reinterpret_cast(stream_); int ndim = static_cast(info.shape.size()); @@ -93,21 +102,37 @@ static inline infiniStatus_t cuda_leakyrelu_impl_incremental( cudaError_t err = cudaSuccess; err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = 
cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } device_id_local = 0; propErr = cudaGetDevice(&device_id_local); @@ -116,24 +141,36 @@ static inline infiniStatus_t cuda_leakyrelu_impl_incremental( if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); } else { - if (bs > 256) bs = 256; + if (bs > 256) { + bs = 256; + } } } else { - if (bs > 256) bs = 256; + if (bs > 256) { + bs = 256; + } } - if (bs <= 0) bs = 256; + if (bs <= 0) { + bs = 256; + } grid = static_cast((info.n + bs - 1) / bs); - if (grid <= 0) grid = 1; + if (grid <= 0) { + grid = 1; + } leakyrelu_kernel<<>>( out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); err = cudaGetLastError(); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } err = cudaStreamSynchronize(stream); - if (err != cudaSuccess) goto cleanup; + if (err != cudaSuccess) { + goto cleanup; + } cudaFree(d_shape); cudaFree(d_div); @@ -156,22 +193,22 @@ infiniStatus_t Descriptor::calculate( const void *input, void *stream) const { - switch (_info.dt_in) { + switch (_info.dt_in) { case INFINI_DTYPE_F16: cuda_leakyrelu_impl_incremental(output, input, _info, stream); - break; + break; case INFINI_DTYPE_BF16: cuda_leakyrelu_impl_incremental(output, input, _info, stream); - break; - case INFINI_DTYPE_F32: + break; + case INFINI_DTYPE_F32: cuda_leakyrelu_impl_incremental(output, input, _info, stream); - break; - case INFINI_DTYPE_F64: + break; + case INFINI_DTYPE_F64: cuda_leakyrelu_impl_incremental(output, input, _info, stream); - break; - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/leakyrelu/operator.cc b/src/infiniop/ops/leakyrelu/operator.cc index ad6d504a8..3f78a4916 100644 --- a/src/infiniop/ops/leakyrelu/operator.cc +++ b/src/infiniop/ops/leakyrelu/operator.cc @@ -19,13 +19,13 @@ __C infiniStatus_t infiniopCreateLeakyreluDescriptor( infiniopTensorDescriptor_t x_desc, float negative_slope) { -#define CREATE_LEAKY(CASE, NAMESPACE) \ - case CASE: \ - return op::leakyrelu::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ +#define CREATE_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return op::leakyrelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ negative_slope) switch (handle->device) { @@ -58,8 +58,8 @@ __C infiniStatus_t infiniopCreateLeakyreluDescriptor( __C infiniStatus_t 
infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -95,8 +95,8 @@ __C infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor __C infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, void *workspace, size_t workspace_size, void *y, const void *x, void *stream) { -#define CALC_LEAKY(CASE, NAMESPACE) \ - case CASE: \ +#define CALC_LEAKY(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, y, x, stream) @@ -130,9 +130,9 @@ __C infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, void *w __C infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc) { -#define DESTROY_LEAKY(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DESTROY_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h index b2f87c2ea..32537ef17 100644 --- a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h @@ -11,9 +11,8 @@ typedef struct SigmoidBackwardOp { static constexpr size_t num_inputs = 2; template T operator()(const T &x, const T &grad_out) const { - using ComputeT = - std::conditional_t || std::is_same_v, - float, T>; + using ComputeT = std::conditional_t || std::is_same_v, + float, T>; ComputeT xv = utils::cast(x); ComputeT gov = utils::cast(grad_out); diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh index 6c10dd26e..42c850004 100644 --- a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh +++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh @@ -1,10 +1,10 @@ #ifndef __SIGMOID_BACKWARD_CUDA_H__ #define __SIGMOID_BACKWARD_CUDA_H__ -#include -#include -#include #include +#include +#include +#include namespace op::sigmoid_backward::cuda { typedef struct SigmoidBackwardOp { diff --git a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc index f30a646d0..40a279f4b 100644 --- a/src/infiniop/ops/sigmoid_backward/operator.cc +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -19,13 +19,13 @@ __C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( infiniopTensorDescriptor_t input_desc, infiniopTensorDescriptor_t grad_output_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - grad_input_desc, \ - {input_desc, \ + grad_input_desc, \ + {input_desc, \ grad_output_desc}) switch (handle->device) { @@ -52,8 +52,8 @@ __C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( __C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS @@ -87,8 +87,8 @@ __C infiniStatus_t infiniopSigmoidBackward( const void *grad_output, void *stream) { -#define CALCULATE(CASE, 
NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) @@ -117,8 +117,8 @@ __C infiniStatus_t infiniopSigmoidBackward( __C infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h index e221c2573..80e406f98 100644 --- a/src/infiniop/ops/sin/cpu/sin_cpu.h +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -1,8 +1,8 @@ #ifndef __SIN_CPU_H__ #define __SIN_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(sin, cpu) diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh index c9993ca12..30641366c 100644 --- a/src/infiniop/ops/sin/cuda/kernel.cuh +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -1,38 +1,38 @@ #ifndef __SIN_CUDA_H__ #define __SIN_CUDA_H__ -#include -#include #include +#include +#include namespace op::sin::cuda { typedef struct SinOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float inputf = __half2float(input); - return __float2half_rn(sinf(inputf)); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - return __floats2bfloat162_rn(__sinf(f0), __sinf(f1)); - } else if constexpr (std::is_same_v) { - float inputf = __bfloat162float(input); - return __float2bfloat16_rn(__sinf(inputf)); - } else if constexpr (std::is_same_v) { - return sinf(input); - } else if constexpr (std::is_same_v) { - return std::sin(input); - } else { - return std::sin(input); + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(sinf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__sinf(f0), __sinf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__sinf(inputf)); + } else if constexpr (std::is_same_v) { + return sinf(input); + } else if constexpr (std::is_same_v) { + return std::sin(input); + } else { + return std::sin(input); + } } - } } SinOp; } // namespace op::sin::cuda diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu index eaac7a582..6fbf952bc 100644 --- a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, 
INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc index 38d8b242c..978561a04 100644 --- a/src/infiniop/ops/sin/operator.cc +++ b/src/infiniop/ops/sin/operator.cc @@ -18,13 +18,13 @@ __C infiniStatus_t infiniopCreateSinDescriptor( infiniopTensorDescriptor_t output_desc, infiniopTensorDescriptor_t input_desc) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sin::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - {input_desc}) \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) switch (handle->device) { @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopSin( __C infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h index 5dc73b383..73fd7c1b6 100644 --- a/src/infiniop/ops/tanh/cpu/tanh_cpu.h +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -1,8 +1,8 @@ #ifndef __TANH_CPU_H__ #define __TANH_CPU_H__ -#include #include "../../../elementwise/cpu/elementwise_cpu.h" +#include ELEMENTWISE_DESCRIPTOR(tanh, cpu) diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh index 49605aa93..62979a20e 100644 --- a/src/infiniop/ops/tanh/cuda/kernel.cuh +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -1,45 +1,45 @@ #ifndef __TANH_CUDA_H__ #define __TANH_CUDA_H__ -#include -#include #include +#include +#include namespace op::tanh::cuda { typedef struct TanhOp { - static constexpr size_t num_inputs = 1; + static constexpr size_t num_inputs = 1; - __device__ __forceinline__ float tanh_f32_func(float x) const { - return tanhf(x); - } - template - __device__ __forceinline__ T operator()(const T &input) const { - if constexpr (std::is_same_v) { - float2 vf = __half22float2(input); - float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); - return __float22half2_rn(vr); - } else if constexpr (std::is_same_v) { - float xf = __half2float(input); - float yf = tanh_f32_func(xf); - return __float2half_rn(yf); - } else if constexpr (std::is_same_v) { - float f0 = __bfloat162float(__low2bfloat16(input)); - float f1 = __bfloat162float(__high2bfloat16(input)); - float r0 = tanh_f32_func(f0); - float r1 = tanh_f32_func(f1); - return __floats2bfloat162_rn(r0, r1); - } else if constexpr (std::is_same_v) { - float xf = __bfloat162float(input); - float rf = tanh_f32_func(xf); - return __float2bfloat16_rn(rf); - } else if constexpr (std::is_same_v) { - return tanh_f32_func(input); - } else if constexpr (std::is_same_v) { - return std::tanh(input); - } else { - return std::tanh(input); + __device__ __forceinline__ float tanh_f32_func(float x) const { + return tanhf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = tanh_f32_func(xf); + return 
__float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = tanh_f32_func(f0); + float r1 = tanh_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = tanh_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return tanh_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::tanh(input); + } else { + return std::tanh(input); + } } - } } TanhOp; } // namespace op::tanh::cuda diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu index eeb6c85bf..a2c36551c 100644 --- a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create( const auto &input_desc = input_desc_vec.at(0); const auto &output_shape = out_desc->shape(); const auto &input_shape = input_desc->shape(); - + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); CHECK_SAME_SHAPE(output_shape, input_shape); diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc index a5ed56f74..d34d97df6 100644 --- a/src/infiniop/ops/tanh/operator.cc +++ b/src/infiniop/ops/tanh/operator.cc @@ -20,11 +20,11 @@ __C infiniStatus_t infiniopCreateTanhDescriptor( #define CREATE(CASE, NAMESPACE) \ case CASE: \ - return op::tanh::NAMESPACE::Descriptor::create( \ + return op::tanh::NAMESPACE::Descriptor::create( \ handle, \ - reinterpret_cast(desc_ptr), \ + reinterpret_cast(desc_ptr), \ output_desc, \ - {input_desc}) \ + {input_desc}) switch (handle->device) { @@ -50,8 +50,8 @@ __C infiniStatus_t infiniopCreateTanhDescriptor( __C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -84,8 +84,8 @@ __C infiniStatus_t infiniopTanh( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc) \ ->calculate(workspace, workspace_size, output, {input}, stream) @@ -114,9 +114,9 @@ __C infiniStatus_t infiniopTanh( __C infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc new file mode 100644 index 000000000..de7e86e3e --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.cc @@ -0,0 +1,84 @@ +#include "where_cpu.h" + +namespace op::where::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); 
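Note: the WhereOp functors introduced for the CPU and CUDA backends below reduce to an element-wise select, out[i] = cond[i] ? a[i] : b[i], over four tensors whose shapes must match (CHECK_SAME_SHAPE in create() below). A host reference sketch with illustrative types:

    #include <cstddef>

    // Element-wise select matching the WhereOp semantics: take a[i] where
    // the condition is non-zero, otherwise b[i].
    void where_ref(float *out, const float *a, const float *b,
                   const unsigned char *cond, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            out[i] = cond[i] ? a[i] : b[i];
        }
    }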
+ const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::cpu diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h new file mode 100644 index 000000000..02ccab234 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -0,0 +1,19 @@ +#ifndef __WHERE_CPU_H__ +#define __WHERE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(where, cpu) + +namespace op::where::cpu { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + T operator()(const T &a, const T &b, const T &cond) const { + return cond ? a : b; + } +} WhereOp; +} // namespace op::where::cpu + +#endif // __WHERE_CPU_H__ diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh new file mode 100644 index 000000000..58e370aa4 --- /dev/null +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __WHERE_CUDA_H__ +#define __WHERE_CUDA_H__ + +namespace op::where::cuda { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + __device__ __forceinline__ T operator()(const T &a, const T &b, const T &cond) const { + return cond ? 
a : b; + } +} WhereOp; +} // namespace op::where::cuda + +#endif // __WHERE_CUDA_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h new file mode 100644 index 000000000..43bb1a945 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.h @@ -0,0 +1,8 @@ +#ifndef __WHERE_METAX_API_H__ +#define __WHERE_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(where, metax) + +#endif // __WHERE_METAX_API_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca new file mode 100644 index 000000000..fb4be9325 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -0,0 +1,62 @@ +#include "where_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::where::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::metax diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu new file mode 100644 index 000000000..860089bd2 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu @@ -0,0 +1,91 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "where_nvidia.cuh" + +namespace op::where::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = 
b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::nvidia diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh new file mode 100644 index 000000000..c168364a8 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __WHERE_CUDA_API_H__ +#define __WHERE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(where, nvidia) + +#endif // __WHERE_CUDA_API_H__ diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc new file mode 100644 index 000000000..d69b1d4e1 --- /dev/null +++ b/src/infiniop/ops/where/operator.cc @@ -0,0 +1,148 @@ +#include "../../operator.h" 
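Note: the where/operator.cc file added below exposes four C entry points. A hypothetical caller sketch is given here for orientation; the handle, tensor descriptors and device buffers (handle, a_desc, b_desc, cond_desc, c_desc, a, b, cond, c) are assumed to exist already, status codes are not checked, and the create / query-workspace / compute / destroy order is the conventional sequence assumed by this sketch, not something spelled out in the patch.

    // Create a descriptor, size its workspace, run the operator on a
    // stream, then destroy the descriptor.
    infiniopWhereDescriptor_t desc = nullptr;
    infiniopCreateWhereDescriptor(handle, &desc, c_desc, a_desc, b_desc, cond_desc);

    size_t workspace_size = 0;
    infiniopGetWhereWorkspaceSize(desc, &workspace_size);
    void *workspace = nullptr; // allocate workspace_size bytes on the target device

    infiniopWhere(desc, workspace, workspace_size, c, a, b, cond, /*stream=*/nullptr);
    infiniopDestroyWhereDescriptor(desc);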
+#include "../../handle.h" +#include "infiniop/ops/where.h" + +#ifdef ENABLE_CPU_API +#include "cpu/where_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/where_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/where_metax.h" +#endif + +__C infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t condition_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc, \ + condition_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b, condition}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..c940d4f05 --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,288 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + 
get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map, +) +from enum import Enum, auto + +# ====================================================================== +# Configuration (Internal Use Only) +# Now each test case tuple is: (shape, a_stride, b_stride, cond_stride, c_stride) +# ====================================================================== +_TEST_CASES_ = [ + ((13, 4), None, None, None, None), + ((13, 4), None, None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None, None), + ((13, 4, 4), None, None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None, None), + ((16, 5632), None, None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_COND = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, + Inplace.INPLACE_COND, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TENSOR_DTYPES = _INTEGER_DTYPES + _FLOAT_DTYPES + +_TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def make_integer_torch_tensor(shape, inf_dt, device): + use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + +def where_ref(c, a, b, cond): + cond_bool = cond.torch_tensor().to(torch.bool) + c.torch_tensor().copy_(torch.where(cond_bool, a.torch_tensor(), b.torch_tensor())) + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + 
cond_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + inf_dt = dtype + + if not is_supported_dt(inf_dt): + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on this platform") + return + + try: + if _is_integer_dtype(inf_dt): + a_torch = make_integer_torch_tensor(shape, inf_dt, device) + b_torch = make_integer_torch_tensor(shape, inf_dt, device) + a = TestTensor.from_torch(a_torch, inf_dt, device) + b = TestTensor.from_torch(b_torch, inf_dt, device) + else: + a = TestTensor(shape, a_stride, inf_dt, device, mode="random") + b = TestTensor(shape, b_stride, inf_dt, device, mode="random") + except RuntimeError as e: + msg = str(e) + if "not implemented for 'UInt32'" in msg or "not implemented for 'UInt64'" in msg or "check_uniform_bounds" in msg: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} because platform torch can't build random tensor: {e}") + return + else: + raise + + dev = torch_device_map[device] + if _is_integer_dtype(inf_dt): + cond_torch = torch.randint(0, 2, size=shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + else: + cond_bool = (torch.rand(shape, device=dev) > 0.5) + cond_torch = cond_bool.to(dtype=to_torch_dtype(inf_dt, compatability_mode=False)) + + cond = TestTensor.from_torch(cond_torch, inf_dt, device) + + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + elif inplace == Inplace.INPLACE_COND: + if c_stride != cond_stride: + return + c = cond + else: + if _is_integer_dtype(inf_dt): + dev = torch_device_map[device] + c_torch = torch.zeros(shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + c = TestTensor.from_torch(c_torch, inf_dt, device) + else: + c = TestTensor(shape, c_stride, inf_dt, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} " + f"shape:{shape} a_stride:{a_stride} b_stride:{b_stride} cond_stride:{cond_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[inf_dt]} inplace:{inplace}" + ) + + where_ref(c, a, b, cond) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + try: + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + cond.descriptor, + ) + ) + except Exception as e: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on {InfiniDeviceNames[device]}: CreateWhereDescriptor failed: {e}") + return + + for tensor in [a, b, c, cond]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_where(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + cond.data(), + None, + ) + ) + + lib_where() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, inf_dt) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: where_ref(c, a, b, cond), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_where(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + 
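# Editor's illustrative sketch (not part of the patch series): where_ref above casts
# the condition tensor to torch.bool because torch.where expects a boolean condition,
# while the infiniop descriptor accepts a condition of any supported dtype; the test
# always fills the condition with 0/1 values, so the cast is lossless. A minimal
# standalone version of the reference computation, assuming plain torch tensors in
# place of the TestTensor wrappers (the helper name below is hypothetical):
def _where_reference_sketch(shape=(13, 4)):
    a = torch.randn(shape)
    b = torch.randn(shape)
    # 0/1 integer condition, mirroring how the test builds `cond` for integer dtypes
    cond = torch.randint(0, 2, shape, dtype=torch.int32)
    # torch.where rejects a non-boolean condition, so convert it first
    return torch.where(cond.to(torch.bool), a, b)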
+def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + supported = [dt for dt in _TENSOR_DTYPES if is_supported_dt(dt)] + devices = get_test_devices(args) + + for device in devices: + test_operator(device, test, _TEST_CASES, supported) + + print("\033[92mTest passed!\033[0m") + + +if __name__ == "__main__": + main() From a126be00f80873ee02b11db78663aa96e8cc3720 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Mon, 18 Aug 2025 00:55:58 +0800 Subject: [PATCH 11/16] [T1-1-1]: operators clang-format --- include/infiniop.h | 14 ++++---- include/infiniop/ops/cast.h | 16 ++++----- include/infiniop/ops/hardswish.h | 16 ++++----- include/infiniop/ops/leakyrelu.h | 18 +++++----- include/infiniop/ops/sigmoid_backward.h | 20 +++++------ include/infiniop/ops/tanh.h | 16 ++++----- include/infiniop/ops/where.h | 24 +++++++------- src/infiniop-test/include/ops.hpp | 44 ++++++++++++------------- 8 files changed, 84 insertions(+), 84 deletions(-) diff --git a/include/infiniop.h b/include/infiniop.h index c86127cb2..30a07e4b4 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -4,26 +4,26 @@ #include "infiniop/handle.h" #include "infiniop/ops/add.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/cast.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/exp.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/leakyrelu.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/sigmoid_backward.h" +#include "infiniop/ops/sin.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" -#include "infiniop/ops/exp.h" -#include "infiniop/ops/sin.h" -#include "infiniop/ops/cos.h" -#include "infiniop/ops/leakyrelu.h" #include "infiniop/ops/tanh.h" -#include "infiniop/ops/sigmoid_backward.h" -#include "infiniop/ops/hardswish.h" -#include "infiniop/ops/cast.h" #include "infiniop/ops/where.h" #include "infiniop/tensor_descriptor.h" diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h index 82b41490e..81d771efe 100644 --- a/include/infiniop/ops/cast.h +++ b/include/infiniop/ops/cast.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopCastDescriptor_t; __C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle, - infiniopCastDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); __C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc); diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h index 79a7c93ea..8d655fe82 100644 --- a/include/infiniop/ops/hardswish.h +++ 
b/include/infiniop/ops/hardswish.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; __C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, - infiniopHardswishDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); __C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); diff --git a/include/infiniop/ops/leakyrelu.h b/include/infiniop/ops/leakyrelu.h index 9ce93d53c..adc46d1c6 100644 --- a/include/infiniop/ops/leakyrelu.h +++ b/include/infiniop/ops/leakyrelu.h @@ -6,19 +6,19 @@ typedef struct InfiniopDescriptor *infiniopLeakyreluDescriptor_t; __C __export infiniStatus_t infiniopCreateLeakyreluDescriptor(infiniopHandle_t handle, - infiniopLeakyreluDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input, - float negative_slope); + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float negative_slope); __C __export infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __C __export infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc); diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h index 2bcc5dee6..abab0cde7 100644 --- a/include/infiniop/ops/sigmoid_backward.h +++ b/include/infiniop/ops/sigmoid_backward.h @@ -6,20 +6,20 @@ typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; __C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, - infiniopSigmoidBackwardDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t grad_input, - infiniopTensorDescriptor_t input, - infiniopTensorDescriptor_t grad_output); + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t grad_output); __C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *grad_input, - const void *input, - const void* grad_output, - void *stream); + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); __C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h index 62974e951..742dba860 100644 --- a/include/infiniop/ops/tanh.h +++ 
b/include/infiniop/ops/tanh.h @@ -6,18 +6,18 @@ typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; __C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, - infiniopTanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t output, - infiniopTensorDescriptor_t input); + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); __C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *output, - const void *input, - void *stream); + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); __C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h index a328c312a..713db102f 100644 --- a/include/infiniop/ops/where.h +++ b/include/infiniop/ops/where.h @@ -6,22 +6,22 @@ typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; __C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, - infiniopWhereDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b, - infiniopTensorDescriptor_t condition); + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + infiniopTensorDescriptor_t condition); __C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); __C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - const void *condition, - void *stream); + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); __C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 4c16eeec7..c3a120703 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -39,28 +39,28 @@ DECLARE_INFINIOP_TEST(where) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ - REGISTER_INFINIOP_TEST(exp) \ - REGISTER_INFINIOP_TEST(sin) \ - REGISTER_INFINIOP_TEST(cos) \ - REGISTER_INFINIOP_TEST(leakyrelu) \ - REGISTER_INFINIOP_TEST(tanh) \ - REGISTER_INFINIOP_TEST(sigmoid_backward)\ - REGISTER_INFINIOP_TEST(hardswish) \ - REGISTER_INFINIOP_TEST(cast) \ - REGISTER_INFINIOP_TEST(where) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) 
\ + REGISTER_INFINIOP_TEST(exp) \ + REGISTER_INFINIOP_TEST(sin) \ + REGISTER_INFINIOP_TEST(cos) \ + REGISTER_INFINIOP_TEST(leakyrelu) \ + REGISTER_INFINIOP_TEST(tanh) \ + REGISTER_INFINIOP_TEST(sigmoid_backward) \ + REGISTER_INFINIOP_TEST(hardswish) \ + REGISTER_INFINIOP_TEST(cast) \ + REGISTER_INFINIOP_TEST(where) \ } namespace infiniop_test { From 92b15d0a2fe0a5e1920b62fbebe31683d8fd8a26 Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Sun, 24 Aug 2025 18:07:53 +0800 Subject: [PATCH 12/16] [T1-1-1]: Modify where operator condition with T->bool --- src/infiniop/ops/where/cpu/where_cpu.h | 2 +- src/infiniop/ops/where/cuda/kernel.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h index 02ccab234..3d86cb4f7 100644 --- a/src/infiniop/ops/where/cpu/where_cpu.h +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -10,7 +10,7 @@ typedef struct WhereOp { public: static constexpr size_t num_inputs = 3; template - T operator()(const T &a, const T &b, const T &cond) const { + T operator()(const T &a, const T &b, const bool &cond) const { return cond ? a : b; } } WhereOp; diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh index 58e370aa4..8eb5c762b 100644 --- a/src/infiniop/ops/where/cuda/kernel.cuh +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -6,7 +6,7 @@ typedef struct WhereOp { public: static constexpr size_t num_inputs = 3; template - __device__ __forceinline__ T operator()(const T &a, const T &b, const T &cond) const { + __device__ __forceinline__ T operator()(const T &a, const T &b, const bool &cond) const { return cond ? a : b; } } WhereOp; From 73a6994cde5f6db360cb4962f6eae1d5201c9d4b Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Wed, 27 Aug 2025 16:45:32 +0800 Subject: [PATCH 13/16] [T1-1-1]: Modify leakyrelu operator profile test --- test/infiniop/leakyrelu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/infiniop/leakyrelu.py b/test/infiniop/leakyrelu.py index 93a8170d2..76562ddf0 100644 --- a/test/infiniop/leakyrelu.py +++ b/test/infiniop/leakyrelu.py @@ -147,7 +147,7 @@ def lib_leakyrelu(): # Profiling workflow if PROFILE: # fmt: off - profile_operation("PyTorch", lambda: leakyrelu(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation("PyTorch", lambda: leakyrelu(output.torch_tensor(), input.torch_tensor(), negative_slope), device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_leakyrelu(), device, NUM_PRERUN, NUM_ITERATIONS) # fmt: on check_error(LIBINFINIOP.infiniopDestroyLeakyreluDescriptor(descriptor)) From 52f6d162fb9f98ec3abf10767809361a1dbdc76d Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Wed, 27 Aug 2025 20:30:01 +0800 Subject: [PATCH 14/16] [T1-1-1]: Modify where operator metax --- src/infiniop/ops/where/metax/where_metax.maca | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca index fb4be9325..46c47e541 100644 --- a/src/infiniop/ops/where/metax/where_metax.maca +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -19,16 +19,28 @@ infiniStatus_t Descriptor::create( const auto &a_desc = input_desc_vec.at(0); const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + const auto &c_shape = out_desc->shape(); const auto &a_shape = 
a_desc->shape(); const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) return INFINI_STATUS_SUCCESS; } @@ -53,6 +65,24 @@ infiniStatus_t Descriptor::calculate( return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } From 394fffb29b668f7a91e9a5b1f1818a492487132b Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Wed, 27 Aug 2025 20:31:37 +0800 Subject: [PATCH 15/16] [T1-1-1]: Modify tanh operator cpp --- src/infiniop-test/src/ops/tanh.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp index 6aeb3c301..6f966de09 100644 --- a/src/infiniop-test/src/ops/tanh.cpp +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -37,8 +37,8 @@ std::shared_ptr Test::build( test->_atol = 1e-3; } if (elemType == GGML_TYPE_F32) { - test->_rtol = 1e-7; - test->_atol = 1e-7; + test->_rtol = 1e-6; + test->_atol = 1e-6; } return test; From 10817ced9d0a9c0a6b2b0bffce8257a91f40e35b Mon Sep 17 00:00:00 2001 From: PPPoint <1024879159@qq.com> Date: Wed, 27 Aug 2025 20:50:44 +0800 Subject: [PATCH 16/16] [T1-1-1]: Modify where operator metax --- src/infiniop/ops/where/metax/where_metax.maca | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca index 46c47e541..b648cfbcc 100644 --- a/src/infiniop/ops/where/metax/where_metax.maca +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -40,7 +40,7 @@ infiniStatus_t Descriptor::create( CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) return INFINI_STATUS_SUCCESS; }
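Usage recap (editor's sketch, not part of the patches): the Where operator introduced in this series follows the same create / query-workspace / compute / destroy lifecycle as the other elementwise operators, and test/infiniop/where.py exercises exactly these calls through the ctypes bindings registered in op_register.py. The sketch below condenses that lifecycle for a single out-of-place float32 case. It assumes the handle and device supplied by the existing test harness (get_test_devices / test_operator) and the TestTensor / TestWorkspace helpers; the function name run_where_once is hypothetical.

import ctypes
from ctypes import c_uint64

import torch
from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    TestWorkspace,
    InfiniDtype,
    check_error,
    infiniopOperatorDescriptor_t,
    torch_device_map,
)


def run_where_once(handle, device, shape=(13, 4), dtype=InfiniDtype.F32):
    # Build inputs the same way test/infiniop/where.py does for float dtypes:
    # random a/b, a 0/1 condition, and an output tensor to overwrite.
    a = TestTensor(shape, None, dtype, device, mode="random")
    b = TestTensor(shape, None, dtype, device, mode="random")
    cond_torch = (torch.rand(shape, device=torch_device_map[device]) > 0.5).to(torch.float32)
    cond = TestTensor.from_torch(cond_torch, dtype, device)
    c = TestTensor(shape, None, dtype, device, mode="ones")

    # 1) Create the descriptor: output first, then a, b, condition.
    desc = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateWhereDescriptor(
            handle, ctypes.byref(desc),
            c.descriptor, a.descriptor, b.descriptor, cond.descriptor,
        )
    )

    # 2) Query and allocate the workspace.
    size = c_uint64(0)
    check_error(LIBINFINIOP.infiniopGetWhereWorkspaceSize(desc, ctypes.byref(size)))
    workspace = TestWorkspace(size.value, c.device)

    # 3) Run the operator (last argument is the stream; None = default).
    check_error(
        LIBINFINIOP.infiniopWhere(
            desc,
            workspace.data(), workspace.size(),
            c.data(), a.data(), b.data(), cond.data(),
            None,
        )
    )

    # 4) Release the descriptor.
    check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(desc))
    return c

Keeping the condition to explicit 0/1 values mirrors the test and, with the later WhereOp change that reads the condition element as bool, is the least ambiguous contract for callers.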