diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..05117dcb9 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -4,18 +4,27 @@ #include "infiniop/handle.h" #include "infiniop/ops/add.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/cast.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/exp.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/hard_swish.h" +#include "infiniop/ops/leaky_relu.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/sigmoid_backward.h" +#include "infiniop/ops/sin.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tanh.h" +#include "infiniop/ops/where.h" #include "infiniop/tensor_descriptor.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h new file mode 100644 index 000000000..c4dd6ccfd --- /dev/null +++ b/include/infiniop/ops/cast.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CAST_API_H__ +#define __INFINIOP_CAST_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCastDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc); + +#endif \ No newline at end of file diff --git 
a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..098c0d7e1 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h new file mode 100644 index 000000000..1b7defcc5 --- /dev/null +++ b/include/infiniop/ops/exp.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_EXP_API_H__ +#define __INFINIOP_EXP_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; + +__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/hard_swish.h b/include/infiniop/ops/hard_swish.h new file mode 100644 index 
000000000..e97284a87 --- /dev/null +++ b/include/infiniop/ops/hard_swish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARD_SWISH_API_H__ +#define __INFINIOP_HARD_SWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardSwishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardSwishDescriptor(infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardSwish(infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/leaky_relu.h b/include/infiniop/ops/leaky_relu.h new file mode 100644 index 000000000..0bd2a3349 --- /dev/null +++ b/include/infiniop/ops/leaky_relu.h @@ -0,0 +1,39 @@ +#ifndef __INFINIOP_LEAKY_RELU_API_H__ +#define __INFINIOP_LEAKY_RELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLeakyReluDescriptor_t; + +/// @brief 创建 LeakyReLU 描述符 +/// @param handle 上下文句柄 +/// @param desc_ptr 输出的算子描述符 +/// @param output 输出张量描述符 +/// @param input 输入张量描述符 +/// @param negative_slope 负斜率 α,float 类型 +__C __export infiniStatus_t infiniopCreateLeakyReluDescriptor( + infiniopHandle_t handle, + infiniopLeakyReluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +/// @brief 获取 workspace 大小 +__C __export infiniStatus_t infiniopGetLeakyReluWorkspaceSize( + infiniopLeakyReluDescriptor_t desc, + size_t *size); + +/// @brief 执行 LeakyReLU 运算 +__C __export infiniStatus_t infiniopLeakyRelu( + infiniopLeakyReluDescriptor_t desc, + void *workspace, + size_t 
workspace_size, + void *output, + const void *input, + float negative_slope, + void *stream); + +/// @brief 销毁 LeakyReLU 描述符 +__C __export infiniStatus_t infiniopDestroyLeakyReluDescriptor( + infiniopLeakyReluDescriptor_t desc); + +#endif // __INFINIOP_LEAKY_RELU_API_H__ diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h new file mode 100644 index 000000000..60b42270c --- /dev/null +++ b/include/infiniop/ops/sigmoid_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_SIGMOID_BACKWARD_API_H__ +#define __INFINIOP_SIGMOID_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t grad_output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *grad_output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..79acec6ee --- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + 
infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..d31ddf396 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define __INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..0b3c0eabe --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + 
infiniopTensorDescriptor_t condition_desc); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize( + infiniopWhereDescriptor_t desc, + size_t *size); + +__C __export infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); + +#endif // __INFINIOP_WHERE_API_H__ diff --git a/src/infiniop/ops/cast/cast.h b/src/infiniop/ops/cast/cast.h new file mode 100644 index 000000000..b69ae286f --- /dev/null +++ b/src/infiniop/ops/cast/cast.h @@ -0,0 +1,46 @@ +#ifndef __CAST_H__ +#define __CAST_H__ + +#include "../../operator.h" + +#define DESCRIPTOR(NAMESPACE) \ +namespace op::cast::NAMESPACE { \ +class Descriptor final : public InfiniopDescriptor { \ + infiniDtype_t _output_dtype; \ + infiniDtype_t _input_dtype; \ + op::elementwise::ElementwiseInfo _info; \ + std::unique_ptr _device_info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t _out_dtype, \ + infiniDtype_t _input_dtype, \ + op::elementwise::ElementwiseInfo info, \ + op::elementwise::NAMESPACE::DeviceImpl *device_info, \ + size_t workspace_size_, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _output_dtype(_out_dtype), \ + _input_dtype(_input_dtype), \ + _info(std::move(info)), \ + _device_info(std::move(device_info)), \ + _workspace_size(workspace_size_) {} \ +public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + std::vector input_descs); \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const; \ +}; \ +} + + +#endif + diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc new file mode 100644 index 000000000..09d81c67a 
--- /dev/null +++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc @@ -0,0 +1,146 @@ +#include "cast_cpu.h" + + + +namespace op::cast::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto output_dtype = out_desc->dtype(); + auto input_dtype = input_desc_vec.at(0)->dtype(); + + const auto &out_shape = out_desc->shape(); + const auto &in_shape = input_desc_vec.at(0)->shape(); + + CHECK_SAME_SHAPE(out_shape, in_shape); + CHECK_DTYPE(output_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U32, INFINI_DTYPE_U64); + CHECK_DTYPE(input_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U32, INFINI_DTYPE_U64); + + // ✅ 使用 ElementwiseInfo::create 而不是构造函数 + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); // 检查是否创建成功 + + // 因为不使用 device_impl,传 nullptr 和 workspace_size = 0 + *desc_ptr = new Descriptor( + output_dtype, + input_dtype, + info_result.take(), + nullptr, + 0, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 简化类型映射宏 + #define DISPATCH_CAST(SRC, DST) \ + return _device_info->calculate(_info, output, inputs, stream); + + // dispatch by _output_dtype (目标类型) + switch (_output_dtype) { + case INFINI_DTYPE_F16: + switch (_input_dtype) { + // 其它类型转换为 half 类型 + case INFINI_DTYPE_F16: DISPATCH_CAST(fp16_t, fp16_t); + case INFINI_DTYPE_F32: DISPATCH_CAST(float, fp16_t); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, fp16_t); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, fp16_t); + case INFINI_DTYPE_I64: 
DISPATCH_CAST(int64_t, fp16_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, fp16_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, fp16_t); + default: break; + } + break; + + case INFINI_DTYPE_F32: + switch (_input_dtype) { + // 其它类型转换为 float 类型 + case INFINI_DTYPE_F16: DISPATCH_CAST(fp16_t, float); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, float); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, float); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, float); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, float); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, float); + case INFINI_DTYPE_F32: DISPATCH_CAST(float, float); + default: break; + } + break; + + case INFINI_DTYPE_F64: + switch (_input_dtype) { + case INFINI_DTYPE_F16: DISPATCH_CAST(fp16_t, double); + case INFINI_DTYPE_F32: DISPATCH_CAST(float, double); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, double); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, double); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, double); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, double); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, double); + default: break; + } + break; + + case INFINI_DTYPE_I32: + switch (_input_dtype) { + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, int32_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, int32_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, int32_t); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, int32_t); + default: break; + } + break; + + case INFINI_DTYPE_I64: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, int64_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, int64_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, int64_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, int64_t); + default: break; + } + break; + + case INFINI_DTYPE_U32: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, uint32_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, uint32_t); + case INFINI_DTYPE_U64: 
DISPATCH_CAST(uint64_t, uint32_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, uint32_t); + default: break; + } + break; + + case INFINI_DTYPE_U64: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, uint64_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, uint64_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, uint64_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, uint64_t); + default: break; + } + break; + + default: + break; + } + + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +} // namespace op::cast::cpu \ No newline at end of file diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.h b/src/infiniop/ops/cast/cpu/cast_cpu.h new file mode 100644 index 000000000..472022bd2 --- /dev/null +++ b/src/infiniop/ops/cast/cpu/cast_cpu.h @@ -0,0 +1,24 @@ +#ifndef __CAST_CPU_H__ +#define __CAST_CPU_H__ + +#include "../cast.h" +#include "../../../elementwise/cpu/elementwise_cpu.h" + +#include "../../../../utils/custom_types.h" + +DESCRIPTOR(cpu) + +namespace op::cast::cpu { + + +typedef struct CastOp { +public: + static constexpr size_t num_inputs = 1; + template + TypeTo operator()(const TypeFrom &val) const { + return utils::cast(val); + }; +} CastOp; +} + +#endif // __CAST_CPU_H__ diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh new file mode 100644 index 000000000..01413e188 --- /dev/null +++ b/src/infiniop/ops/cast/cuda/kernel.cuh @@ -0,0 +1,43 @@ +#ifndef __CAST_CUDA_H__ +#define __CAST_CUDA_H__ + +template +__device__ __forceinline__ Dst convert_cast(const Src& x) { + // return utils::cast(x); + return static_cast(x); +} + +template <> +__device__ __forceinline__ float convert_cast(const half& x) { + return __half2float(x); +} + +// 特化2:uint64_t → __half(新增,解决歧义转换问题) +// 显式将 uint64_t 转为 unsigned long long(匹配 __half 的明确构造函数) +template <> +__device__ __forceinline__ __half convert_cast(const uint64_t& x) { + // 步骤1:先转 unsigned long long(匹配 __half(const unsigned long long val) 构造函数) + // 步骤2:再转 
__half,消除编译器歧义 + return static_cast<__half>(static_cast(x)); +} + +namespace op::cast::cuda { +typedef struct CastOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const U &a) const { + return convert_cast(a); + } + + // 用于 elementwise 内核的显式模板调度(Elementwise Kernel 会显式传 ) + template + __device__ __forceinline__ Tout operator()(const Tin&... args) const { + static_assert(sizeof...(Tin) == 1, "CastOp expects exactly 1 input"); + const auto &x = std::get<0>(std::tie(args...)); + return (*this).operator()(x); + } +} CastOp; +} // namespace op::cast::cuda + +#endif // __CAST_CUDA_H__ diff --git a/src/infiniop/ops/cast/metax/cast_metax.h b/src/infiniop/ops/cast/metax/cast_metax.h new file mode 100644 index 000000000..9c9eab5e8 --- /dev/null +++ b/src/infiniop/ops/cast/metax/cast_metax.h @@ -0,0 +1,13 @@ +#ifndef __CAST_METAX_API_H__ +#define __CAST_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +// ELEMENTWISE_DESCRIPTOR(cast, metax) + +#include "../cast.h" + + +DESCRIPTOR(metax) + +#endif // __Cast_METAX_API_H__ diff --git a/src/infiniop/ops/cast/metax/cast_metax.maca b/src/infiniop/ops/cast/metax/cast_metax.maca new file mode 100644 index 000000000..90f63ec1b --- /dev/null +++ b/src/infiniop/ops/cast/metax/cast_metax.maca @@ -0,0 +1,152 @@ + +#include "cast_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cast::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto output_dtype = out_desc->dtype(); + auto input_dtype = input_desc_vec.at(0)->dtype(); + + const auto &out_shape = out_desc->shape(); + const auto &in_shape = input_desc_vec.at(0)->shape(); + + CHECK_SAME_SHAPE(out_shape, in_shape); + 
CHECK_DTYPE(output_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U32, INFINI_DTYPE_U64); + CHECK_DTYPE(input_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U32, INFINI_DTYPE_U64); + + // ✅ 使用 ElementwiseInfo::create 而不是构造函数 + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); // 检查是否创建成功 + + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + output_dtype, + input_dtype, + std::move(info), + std::move(device_impl_result.take()), + workspace_size, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 简化类型映射宏 + #define DISPATCH_CAST(SRC, DST) \ + return _device_info->calculate<256, cuda::CastOp, DST, SRC>(_info, workspace, output, inputs, stream); + + // dispatch by _output_dtype (目标类型) + switch (_output_dtype) { + case INFINI_DTYPE_F16: + switch (_input_dtype) { + case INFINI_DTYPE_F32: DISPATCH_CAST(float, half); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, half); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, half); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, half); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, half); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, half); + case INFINI_DTYPE_F16: DISPATCH_CAST(half, half); + default: break; + } + break; + + case INFINI_DTYPE_F32: + switch (_input_dtype) { + case INFINI_DTYPE_F16: DISPATCH_CAST(half, float); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, float); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, 
float); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, float); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, float); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, float); + case INFINI_DTYPE_F32: DISPATCH_CAST(float, float); + default: break; + } + break; + + case INFINI_DTYPE_F64: + switch (_input_dtype) { + case INFINI_DTYPE_F16: DISPATCH_CAST(half, double); + case INFINI_DTYPE_F32: DISPATCH_CAST(float, double); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, double); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, double); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, double); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, double); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, double); + default: break; + } + break; + + case INFINI_DTYPE_I32: + switch (_input_dtype) { + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, int32_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, int32_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, int32_t); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, int32_t); + default: break; + } + break; + + case INFINI_DTYPE_I64: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, int64_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, int64_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, int64_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, int64_t); + default: break; + } + break; + + case INFINI_DTYPE_U32: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, uint32_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, uint32_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, uint32_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, uint32_t); + default: break; + } + break; + + case INFINI_DTYPE_U64: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, uint64_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, uint64_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, uint64_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, uint64_t); + 
default: break; + } + break; + + default: + break; + } + + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +} // namespace op::cast::metax \ No newline at end of file diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu new file mode 100644 index 000000000..24dfb6537 --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -0,0 +1,149 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cast_nvidia.cuh" + +namespace op::cast::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto output_dtype = out_desc->dtype(); + auto input_dtype = input_desc_vec.at(0)->dtype(); + + const auto &out_shape = out_desc->shape(); + const auto &in_shape = input_desc_vec.at(0)->shape(); + + CHECK_SAME_SHAPE(out_shape, in_shape); + CHECK_DTYPE(output_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U32, INFINI_DTYPE_U64); + CHECK_DTYPE(input_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_U32, INFINI_DTYPE_U64); + + // ✅ 使用 ElementwiseInfo::create 而不是构造函数 + auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); // 检查是否创建成功 + + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal()); + CHECK_RESULT(device_impl_result); + + *desc_ptr = new Descriptor( + output_dtype, + input_dtype, + std::move(info), + std::move(device_impl_result.take()), + workspace_size, + handle->device, + handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + + 
+infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 简化类型映射宏 + #define DISPATCH_CAST(SRC, DST) \ + return _device_info->calculate<256, cuda::CastOp, DST, SRC>(_info, workspace, output, inputs, stream); + + // dispatch by _output_dtype (目标类型) + switch (_output_dtype) { + case INFINI_DTYPE_F16: + switch (_input_dtype) { + case INFINI_DTYPE_F32: DISPATCH_CAST(float, half); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, half); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, half); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, half); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, half); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, half); + case INFINI_DTYPE_F16: DISPATCH_CAST(half, half); + default: break; + } + break; + + case INFINI_DTYPE_F32: + switch (_input_dtype) { + case INFINI_DTYPE_F16: DISPATCH_CAST(half, float); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, float); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, float); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, float); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, float); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, float); + case INFINI_DTYPE_F32: DISPATCH_CAST(float, float); + default: break; + } + break; + + case INFINI_DTYPE_F64: + switch (_input_dtype) { + case INFINI_DTYPE_F16: DISPATCH_CAST(half, double); + case INFINI_DTYPE_F32: DISPATCH_CAST(float, double); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, double); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, double); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, double); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, double); + case INFINI_DTYPE_F64: DISPATCH_CAST(double, double); + default: break; + } + break; + + case INFINI_DTYPE_I32: + switch (_input_dtype) { + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, int32_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, int32_t); + case INFINI_DTYPE_U64: 
DISPATCH_CAST(uint64_t, int32_t); + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, int32_t); + default: break; + } + break; + + case INFINI_DTYPE_I64: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, int64_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, int64_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, int64_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, int64_t); + default: break; + } + break; + + case INFINI_DTYPE_U32: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, uint32_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, uint32_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, uint32_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, uint32_t); + default: break; + } + break; + + case INFINI_DTYPE_U64: + switch (_input_dtype) { + case INFINI_DTYPE_I32: DISPATCH_CAST(int32_t, uint64_t); + case INFINI_DTYPE_I64: DISPATCH_CAST(int64_t, uint64_t); + case INFINI_DTYPE_U32: DISPATCH_CAST(uint32_t, uint64_t); + case INFINI_DTYPE_U64: DISPATCH_CAST(uint64_t, uint64_t); + default: break; + } + break; + + default: + break; + } + + return INFINI_STATUS_BAD_TENSOR_DTYPE; +} + +} // namespace op::cast::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh new file mode 100644 index 000000000..05c07efcf --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh @@ -0,0 +1,12 @@ +#ifndef __CAST_CUDA_API_H__ +#define __CAST_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + + +#include "../cast.h" + + +DESCRIPTOR(nvidia) + +#endif // __CAST_CUDA_API_H__ diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc new file mode 100644 index 000000000..4892291c3 --- /dev/null +++ b/src/infiniop/ops/cast/operator.cc @@ -0,0 +1,147 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cast.h" + +#ifdef ENABLE_CPU_API +#include 
"cpu/cast_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cast_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cast_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCastDescriptor( + infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cast::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + {input}) + + // 根据设备类型选择对应的实现 + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); // CPU实现 +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); // NVIDIA实现 +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); // ILUVATAR实现,复用NVIDIA +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); // METAX实现 +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; // 不支持的设备类型 + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型获取工作空间大小 + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCast( + infiniopCastDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) 
\ + ->calculate(workspace, workspace_size, output, {input}, stream) + + // 根据设备类型调用对应的计算实现 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型释放对应的描述符 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..9d9d7d471 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,53 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 获取输入和输出张量的描述符和形状 + const auto &x_desc = input_desc_vec.at(0); + const auto &c_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, x_shape); + + // create CPU 
#ifndef __COS_CUDA_H__
#define __COS_CUDA_H__

#include <cuda_bf16.h>
#include <cuda_fp16.h>

namespace op::cos::cuda {

// Element-wise cosine functor consumed by the generic elementwise CUDA
// launcher (one input, one output, same dtype).
typedef struct CosOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a) const {
        if constexpr (std::is_same_v<T, half>) {
            // BUG FIX: the previous code returned hsin2(cosf(a)) — a half2
            // *sine* intrinsic applied to a float cosine. hcos is the
            // correct scalar half-precision cosine.
            return hcos(a);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // No native bf16 cosine intrinsic: compute in float, convert back.
            return __float2bfloat16(cosf(__bfloat162float(a)));
        } else if constexpr (std::is_same_v<T, float>) {
            return cosf(a);
        } else {
            return ::cos(a); // double and any remaining types
        }
    }
} CosOp;

} // namespace op::cos::cuda

#endif // __COS_CUDA_H__
INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // 根据数据类型选择对应的 CUDA kernel 进行计算 + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::metax + diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu new file mode 100644 index 000000000..49114190d --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cos_nvidia.cuh" + +namespace op::cos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &c_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < 
_workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh new file mode 100644 index 000000000..f6c350dd6 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COS_CUDA_API_H__ +#define __COS_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_CUDA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..8f72903da --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,151 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cos_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cos_metax.h" +#endif + +// 创建cos算子的描述符 +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + 
reinterpret_cast(desc_ptr), \ + output, \ + {input}) + + // 根据设备类型选择对应的实现 + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); // CPU实现 +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); // NVIDIA实现 +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); // ILUVATAR实现,复用NVIDIA +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); // METAX实现 +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; // 不支持的设备类型 + } + +#undef CREATE +} + +// 获取Cos算子的工作空间大小 +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型获取工作空间大小 + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +// 执行Cos算子计算 +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + // 根据设备类型调用对应的计算实现 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +// 销毁Cos算子的描述符 +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型释放对应的描述符 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..375c082f7 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,53 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 获取输入和输出张量的描述符和形状 + const auto &x_desc = input_desc_vec.at(0); + const auto &c_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return 
_device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..bd1e37248 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,19 @@ +#ifndef __EXP_CPU_H__ +#define __EXP_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &a) const { + return ::exp(a);; + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..336ef6e04 --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __EXP_CUDA_H__ +#define __EXP_CUDA_H__ + + +namespace op::exp::cuda { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &a) const { + if constexpr (std::is_same_v) { + return hexp(a); // 半精度 + } else if constexpr (std::is_same_v) { + return __expf(__bfloat162float(a)); // 先转成float再exp + } else if constexpr (std::is_same_v) { + return __expf(a); // 快速 float 指数函数 + } else if constexpr (std::is_same_v) { + return ::exp(a); // 双精度标准库 + } else { + return ::exp(a); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h new file mode 100644 index 000000000..d5721f57f --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.h @@ -0,0 
+1,8 @@ +#ifndef __EXP_METAX_API_H__ +#define __EXP_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(exp, metax) + +#endif // __Exp_METAX_API_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca new file mode 100644 index 000000000..77e41ed7a --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.maca @@ -0,0 +1,72 @@ + +#include "exp_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::exp::metax { + +// 析构函数,默认实现 +Descriptor::~Descriptor() = default; + +// 创建exp算子的描述符 +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + // 将句柄转换为 metax 设备句柄 + auto handle = reinterpret_cast(handle_); + // 获取输出张量的数据类型 + auto dtype = out_desc->dtype(); + + // 获取输入和输出张量的描述符和形状 + const auto &x_desc = input_desc_vec.at(0); + const auto &c_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + // 检查数据类型是否支持 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // 检查输入输出张量形状是否一致 + CHECK_SAME_SHAPE(c_shape, x_shape); + + // 创建 CUDA 上的 elementwise 描述符 + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +// 执行Exp计算 +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 检查工作空间是否足够 + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // 根据数据类型选择对应的 CUDA kernel 进行计算 + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax + diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..406b3dc48 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &c_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..dc4bbe7c5 --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,151 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" +#endif + +// 创建Exp算子的描述符 +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + {input}) + + // 根据设备类型选择对应的实现 + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); // CPU实现 +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); // NVIDIA实现 +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); // ILUVATAR实现,复用NVIDIA +#endif 
+#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); // METAX实现 +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; // 不支持的设备类型 + } + +#undef CREATE +} + +// 获取Exp算子的工作空间大小 +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型获取工作空间大小 + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +// 执行Exp算子计算 +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + // 根据设备类型调用对应的计算实现 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +// 销毁Exp算子的描述符 +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型释放对应的描述符 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, 
cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + diff --git a/src/infiniop/ops/hard_swish/cpu/hard_swish_cpu.cc b/src/infiniop/ops/hard_swish/cpu/hard_swish_cpu.cc new file mode 100644 index 000000000..2a22f2f75 --- /dev/null +++ b/src/infiniop/ops/hard_swish/cpu/hard_swish_cpu.cc @@ -0,0 +1,53 @@ +#include "hard_swish_cpu.h" + +namespace op::hard_swish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 获取输入和输出张量的描述符和形状 + const auto &x_desc = input_desc_vec.at(0); + const auto &c_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // 
#ifndef __HARD_SWISH_CUDA_H__
#define __HARD_SWISH_CUDA_H__

#include <cuda_bf16.h>
#include <cuda_fp16.h>

namespace op::hard_swish::cuda {

// Element-wise hard-swish functor: hard_swish(x) = x * relu6(x + 3) / 6.
typedef struct HardSwishOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a) const {
        if constexpr (std::is_same_v<T, half>) {
            const half three = __float2half(3.0f);
            const half six = __float2half(6.0f);
            const half zero = __float2half(0.0f);
            // relu6(a + 3) entirely in half precision.
            half clipped = __hmin(__hmax(__hadd(a, three), zero), six);
            return __hmul(a, __hdiv(clipped, six));
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // bf16 lacks the needed intrinsics: compute in float, convert back.
            float af = __bfloat162float(a);
            float relu6 = fminf(fmaxf(af + 3.0f, 0.0f), 6.0f);
            return __float2bfloat16(af * relu6 / 6.0f);
        } else if constexpr (std::is_same_v<T, float>) {
            float relu6 = fminf(fmaxf(a + 3.0f, 0.0f), 6.0f);
            return a * relu6 / 6.0f;
        } else if constexpr (std::is_same_v<T, double>) {
            double relu6 = fmin(fmax(a + 3.0, 0.0), 6.0);
            return a * relu6 / 6.0;
        } else {
            // Generic fallback: the original called std::min/std::max here,
            // which are not guaranteed to be callable from __device__ code.
            T shifted = a + T(3);
            T relu6 = shifted < T(0) ? T(0) : (shifted > T(6) ? T(6) : shifted);
            return a * relu6 / T(6);
        }
    }
} HardSwishOp;

} // namespace op::hard_swish::cuda

#endif // __HARD_SWISH_CUDA_H__
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 检查工作空间是否足够 + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // 根据数据类型选择对应的 CUDA kernel 进行计算 + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hard_swish::metax + diff --git a/src/infiniop/ops/hard_swish/nvidia/hard_swish_nvidia.cu b/src/infiniop/ops/hard_swish/nvidia/hard_swish_nvidia.cu new file mode 100644 index 000000000..8e36155d9 --- /dev/null +++ b/src/infiniop/ops/hard_swish/nvidia/hard_swish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hard_swish_nvidia.cuh" + +namespace op::hard_swish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &c_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, x_shape); + + // create CUDA elementwise descriptor + 
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hard_swish::nvidia diff --git a/src/infiniop/ops/hard_swish/nvidia/hard_swish_nvidia.cuh b/src/infiniop/ops/hard_swish/nvidia/hard_swish_nvidia.cuh new file mode 100644 index 000000000..8c67a0de4 --- /dev/null +++ b/src/infiniop/ops/hard_swish/nvidia/hard_swish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARD_SWISH_CUDA_API_H__ +#define __HARD_SWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hard_swish, nvidia) + +#endif // __HARD_SWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hard_swish/operator.cc b/src/infiniop/ops/hard_swish/operator.cc new file mode 100644 index 000000000..f6774621f --- /dev/null +++ b/src/infiniop/ops/hard_swish/operator.cc @@ -0,0 +1,151 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hard_swish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hard_swish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include 
"nvidia/hard_swish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hard_swish_metax.h" +#endif + +// 创建hard_swish算子的描述符 +__C infiniStatus_t infiniopCreateHardSwishDescriptor( + infiniopHandle_t handle, + infiniopHardSwishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hard_swish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + {input}) + + // 根据设备类型选择对应的实现 + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); // CPU实现 +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); // NVIDIA实现 +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); // ILUVATAR实现,复用NVIDIA +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); // METAX实现 +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; // 不支持的设备类型 + } + +#undef CREATE +} + +// 获取HardSwish算子的工作空间大小 +__C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型获取工作空间大小 + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +// 执行HardSwish算子计算 +__C infiniStatus_t infiniopHardSwish( + infiniopHardSwishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return 
reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + // 根据设备类型调用对应的计算实现 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +// 销毁HardSwish算子的描述符 +__C infiniStatus_t +infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型释放对应的描述符 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + diff --git a/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc new file mode 100644 index 000000000..ca9ce93cb --- /dev/null +++ b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.cc @@ -0,0 +1,56 @@ +#include "leaky_relu_cpu.h" + +namespace op::leaky_relu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec +) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 获取输入和输出张量的描述符和形状 + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, 
INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + float negative_slope, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream, negative_slope); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream, negative_slope); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream, negative_slope); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream, negative_slope); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} +// namespace op::leaky_relu::cpu diff --git a/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h new file mode 100644 index 000000000..ed786d27f --- /dev/null +++ b/src/infiniop/ops/leaky_relu/cpu/leaky_relu_cpu.h @@ -0,0 +1,63 @@ +#ifndef __LEAKY_RELU_CPU_H__ +#define __LEAKY_RELU_CPU_H__ + +#include + +// 引入elementwise(自动定义Descriptor)的宏 +#include "../../../elementwise/cpu/elementwise_cpu.h" + +// 取消自动定义Descriptor的宏函数 +// ELEMENTWISE_DESCRIPTOR(leaky_relu, cpu) + +namespace op::leaky_relu::cpu { + class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::cpu::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + 
_info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + + public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + float negative_slope, + void *stream) const; + }; + } + +namespace op::leaky_relu::cpu { +typedef struct LeakyReluOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x, float negative_slope) const { + return x >= T(0) ? x : negative_slope * x; + } +} LeakyReluOp; +} // namespace op::leaky_relu::cpu + +#endif // __LEAKY_RELU_CPU_H__ diff --git a/src/infiniop/ops/leaky_relu/cuda/kernel.cuh b/src/infiniop/ops/leaky_relu/cuda/kernel.cuh new file mode 100644 index 000000000..d1ccc3633 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef __LEAKY_RELU_CUDA_H__ +#define __LEAKY_RELU_CUDA_H__ + + +namespace op::leaky_relu::cuda { +typedef struct LeakyReluOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x, float negative_slope) const { + if constexpr (std::is_same_v) { + const half2 zero = __float2half2_rn(0.0f); + const half2 slope = __float2half2_rn(negative_slope); + return __hge2(x, zero) ? x : __hmul2(slope, x); + } else if constexpr (std::is_same_v) { + // Resolution 1 + return __hge(x, __float2half(0.0f)) ? x : __hmul(__float2half(negative_slope), x); + // Resolution 2 + // const half zero = __float2half_rn(0.0f); + // const half slope = __float2half_rn(negative_slope); + // return __hge(x, zero) ? x : __hmul(slope, x); + // Resolution 3 + // float xf = __half2float(x); + // float res = xf >= 0.0f ? 
xf : negative_slope * xf; + // return __float2half(res); + } else if constexpr (std::is_same_v) { + return __bfloat162float(x) >= 0.0f ? x : __hmul(__float2bfloat16(negative_slope), x); + } else if constexpr (std::is_same_v) { + return x>=0.0f ? x : __fmul_rn(negative_slope, x); + } else { + return x>=0 ? x : negative_slope * x; + } + } +} LeakyReluOp; +} // namespace op::leaky_relu::cuda + +#endif // __LEAKY_RELU_CUDA_H__ diff --git a/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h new file mode 100644 index 000000000..9d2f70009 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.h @@ -0,0 +1,51 @@ +#ifndef __LEAKY_RELU_METAX_API_H__ +#define __LEAKY_RELU_METAX_API_H__ + +#include + +// #include "../../../elementwise/metax/elementwise_metax.h" +#include "../../../elementwise/metax/elementwise_metax_api.h" + +// ELEMENTWISE_DESCRIPTOR(leaky_relu, metax) + +namespace op::leaky_relu::metax { + class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + + public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + void *output, + std::vector inputs, + float negative_slope, + void *stream) const; + }; + } + +#endif // __LEAKY_RELU_METAX_API_H__ diff --git 
a/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca new file mode 100644 index 000000000..9ad53068c --- /dev/null +++ b/src/infiniop/ops/leaky_relu/metax/leaky_relu_metax.maca @@ -0,0 +1,74 @@ + +#include "leaky_relu_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::leaky_relu::metax { + +// 析构函数,默认实现 +Descriptor::~Descriptor() = default; + +// 创建leaky_relu算子的描述符 +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, // 输入句柄 + Descriptor **desc_ptr, // 输出描述符指针 + infiniopTensorDescriptor_t out_desc, // 输出张量描述符 + std::vector input_desc_vec +) { // 输入张量描述符向量 + + // 将句柄转换为 metax 设备句柄 + auto handle = reinterpret_cast(handle_); + // 获取输出张量的数据类型 + auto dtype = out_desc->dtype(); + + // 获取输入和输出张量的描述符和形状 + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + // 检查数据类型是否支持 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // 检查输入输出张量形状是否一致 + CHECK_SAME_SHAPE(output_shape, input_shape); + + // 创建 CUDA 上的 elementwise 描述符 + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +// 执行leaky_relu计算 +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + float negative_slope, + void *stream) const { + + // 检查工作空间是否足够 + // if (workspace_size < _workspace_size) { + // return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + // } + + // 根据数据类型选择对应的 CUDA kernel 进行计算 + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LeakyReluOp, half>(_info, workspace, output, inputs, stream, negative_slope); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LeakyReluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, negative_slope); + 
case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LeakyReluOp, float>(_info, workspace, output, inputs, stream, negative_slope); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LeakyReluOp, double>(_info, workspace, output, inputs, stream, negative_slope); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::leaky_relu::metax + diff --git a/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu new file mode 100644 index 000000000..413304216 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cu @@ -0,0 +1,60 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "leaky_relu_nvidia.cuh" + +namespace op::leaky_relu::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + float negative_slope, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LeakyReluOp, half>(_info, workspace, output, 
inputs, stream, negative_slope); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LeakyReluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, negative_slope); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LeakyReluOp, float>(_info, workspace, output, inputs, stream, negative_slope); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LeakyReluOp, double>(_info, workspace, output, inputs, stream, negative_slope); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::leaky_relu::nvidia diff --git a/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh new file mode 100644 index 000000000..560e53f78 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/nvidia/leaky_relu_nvidia.cuh @@ -0,0 +1,51 @@ +#ifndef __LEAKY_RELU_CUDA_API_H__ +#define __LEAKY_RELU_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +#include + +// ELEMENTWISE_DESCRIPTOR(leaky_relu, nvidia) + + +namespace op::leaky_relu::nvidia { + class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(std::move(device_info)), + _workspace_size(workspace_size) {} + + public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs); + + infiniStatus_t calculate( + void *workspace, size_t workspace_size, + 
void *output, + std::vector inputs, + float negative_slope, + void *stream) const; + }; + } + +#endif // __LEAKY_RELU_CUDA_API_H__ diff --git a/src/infiniop/ops/leaky_relu/operator.cc b/src/infiniop/ops/leaky_relu/operator.cc new file mode 100644 index 000000000..2d7838ca9 --- /dev/null +++ b/src/infiniop/ops/leaky_relu/operator.cc @@ -0,0 +1,140 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/leaky_relu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/leaky_relu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/leaky_relu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/leaky_relu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLeakyReluDescriptor( + infiniopHandle_t handle, + infiniopLeakyReluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::leaky_relu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + {input}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLeakyReluWorkspaceSize(infiniopLeakyReluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + 
GET(INFINI_DEVICE_METAX, metax) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopLeakyRelu( + infiniopLeakyReluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + float negative_slope, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, negative_slope, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyLeakyReluDescriptor(infiniopLeakyReluDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc new file mode 100644 index 000000000..97d1140c7 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc @@ -0,0 +1,55 @@ +#include "sigmoid_backward_cpu.h" + +namespace op::sigmoid_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + 
infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 获取输入和输出张量的描述符和形状 + const auto &grad_output_desc = input_desc_vec.at(0); + const auto &x_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, x_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::cpu diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h new file mode 100644 index 000000000..500eb6473 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h @@ -0,0 +1,21 @@ +#ifndef __SIGMOID_BACKWARD_CPU_H__ +#define __SIGMOID_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, cpu) + +namespace op::sigmoid_backward::cpu { +typedef struct SigmoidBackwardOp { +public: + 
static constexpr size_t num_inputs = 2; + template + T operator()(const T &grad_output, const T &input) const { + auto sigmoid = 1 / (1 + exp(-input)); + return sigmoid * (1 - sigmoid) * grad_output; + } +} SigmoidBackwardOp; +} + +#endif // __SIGMOID_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh new file mode 100644 index 000000000..acd45fb82 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh @@ -0,0 +1,49 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_H__ +#define __SIGMOID_BACKWARD_CUDA_H__ + + +namespace op::sigmoid_backward::cuda { + typedef struct SigmoidBackwardOp { + public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &grad_output, const T &input) const { + if constexpr (std::is_same_v) { + float x = __half2float(input); + float grad = __half2float(grad_output); + float sig = 1.0f / (1.0f + __expf(-x)); + float dy_dx = sig * (1.0f - sig); + return __float2half(grad * dy_dx); + } else if constexpr (std::is_same_v) { + float x = __bfloat162float(input); + float grad = __bfloat162float(grad_output); + float sig; + if (x >= 0.f) { + float z = ::expf(-x); + sig = 1.f / (1.f + z); + } else { + float z = ::expf(x); + sig = z / (1.f + z); + } + float dy_dx = sig * (1.f - sig); + return __float2bfloat16(grad * dy_dx); + } else if constexpr (std::is_same_v) { + float sig = 1.0f / (1.0f + __expf(-input)); + return grad_output * sig * (1.0f - sig); + } else if constexpr (std::is_same_v) { + double sig = 1.0 / (1.0 + ::exp(-input)); + return grad_output * sig * (1.0 - sig); + } else { + // fallback to double for other types + double x = static_cast(input); + double grad = static_cast(grad_output); + double sig = 1.0 / (1.0 + ::exp(-x)); + double dy_dx = sig * (1.0 - sig); + return static_cast(grad * dy_dx); + } + } + } SigmoidBackwardOp; +} + +#endif + diff --git 
a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h new file mode 100644 index 000000000..b3fdb5986 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_METAX_API_H__ +#define __SIGMOID_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, metax) + +#endif diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca new file mode 100644 index 000000000..eb3a62657 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca @@ -0,0 +1,74 @@ + +#include "sigmoid_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sigmoid_backward::metax { + +// 析构函数,默认实现 +Descriptor::~Descriptor() = default; + +// 创建sigmoid_backward算子的描述符 +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + // 将句柄转换为 metax 设备句柄 + auto handle = reinterpret_cast(handle_); + // 获取输出张量的数据类型 + auto dtype = out_desc->dtype(); + + // 获取输入和输出张量的描述符和形状 + const auto &grad_output_desc = input_desc_vec.at(0); + const auto &x_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + const auto &x_shape = x_desc->shape(); + + // 检查数据类型是否支持 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // 检查输入输出张量形状是否一致 + CHECK_SAME_SHAPE(c_shape, x_shape, grad_output_shape); + + // 创建 CUDA 上的 elementwise 描述符 + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +// 执行sigmoid_backward计算 +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 检查工作空间是否足够 + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // 根据数据类型选择对应的 CUDA kernel 进行计算 + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} + diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu new file mode 100644 index 000000000..3b9c192b9 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sigmoid_backward_nvidia.cuh" + +namespace op::sigmoid_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &grad_output_desc = input_desc_vec.at(0); + const auto &x_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, 
INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, x_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh new file mode 100644 index 000000000..58663caac --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_API_H__ +#define __SIGMOID_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, nvidia) + +#endif diff --git a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc new file mode 100644 index 000000000..a701b8a8b --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -0,0 +1,153 @@ +#include "../../operator.h" 
+#include "../../handle.h" +#include "infiniop/ops/sigmoid_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sigmoid_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sigmoid_backward_metax.h" +#endif + +// 创建sigmoid_backward算子的描述符 +__C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( + infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t grad_output, + infiniopTensorDescriptor_t input) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input, \ + {grad_output, input}) + + // 根据设备类型选择对应的实现 + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); // CPU实现 +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); // NVIDIA实现 +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); // ILUVATAR实现,复用NVIDIA +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); // METAX实现 +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; // 不支持的设备类型 + } + +#undef CREATE +} + +// 获取sigmoidBackward算子的工作空间大小 +__C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型获取工作空间大小 + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } 
+#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +// 执行SigmoidBackward算子计算 +__C infiniStatus_t infiniopSigmoidBackward( + infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *grad_output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {grad_output, input}, stream) + + // 根据设备类型调用对应的计算实现 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +// 销毁sigmoidBackward算子的描述符 +__C infiniStatus_t +infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + // 根据设备类型释放对应的描述符 + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} + diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..5d19dc49c --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,58 @@ +#include "sin_cpu.h" + +namespace op::sin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor 
**desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + // 将handle_转换为device::cpu::Handle类型 + auto handle = reinterpret_cast(handle_); + // 获取输出描述符的数据类型 + auto dtype = out_desc->dtype(); + + // 获取输入描述符的形状 + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + // 检查数据类型是否为INFINI_DTYPE_F16、INFINI_DTYPE_F32、INFINI_DTYPE_F64或INFINI_DTYPE_BF16 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // 检查输出描述符的形状是否与输入描述符的形状相同 + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +// 计算函数,根据数据类型调用相应的计算函数 +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::cpu \ No newline at end of file diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..8aaf474ea --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,20 @@ +#ifndef __SIN_CPU_H__ +#define __SIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(sin, cpu) + +namespace op::sin::cpu { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) 
const { + return std::sin(x); + } +} SinOp; +} // namespace op::sin::cpu + +#endif // __SIN_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..19f31f39d --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,31 @@ +#ifndef __SIN_CUDA_H__ +#define __SIN_CUDA_H__ + +#include +// #include +// #include +// #include + +namespace op::sin::cuda { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + // h2sin/hsin are the CUDA half-precision sine intrinsics (half2 and half/bf16); std::sin covers float/double. + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2sin(x); + } else if constexpr (std::is_same_v || std::is_same_v) { + return hsin(x); + } + // else if constexpr (std::is_same_v) { + // return fsin_rd(x); + // } + else { + return std::sin(x); + } + } +} SinOp; +} // namespace op::sin::cuda + +#endif // __SIN_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h new file mode 100644 index 000000000..6dc34ea3a --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.h @@ -0,0 +1,9 @@ +#ifndef __SIN_METAX_API_H__ +#define __SIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + + +ELEMENTWISE_DESCRIPTOR(sin, metax) + +#endif // __SIN_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca new file mode 100644 index 000000000..765b486e8 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.maca @@ -0,0 +1,77 @@ +#include "sin_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + + + + + +namespace op::sin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector 
input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } +/* + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +*/ + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return 
INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax \ No newline at end of file diff --git a/src/infiniop/ops/sin/metax/sin_metax_op.h b/src/infiniop/ops/sin/metax/sin_metax_op.h new file mode 100644 index 000000000..2cd35f752 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax_op.h @@ -0,0 +1,16 @@ +// #ifndef __SIN_METAX_OP_H__ +// #define __SIN_METAX_OP_H__ + +// #include +// namespace op::sin::metax { +// typedef struct SinOp { +// public: +// static constexpr size_t num_inputs = 1; +// template +// __device__ __forceinline__ T operator()(const T &x) const { +// return sin(x); +// } +// } SinOp; +// } // namespace op::sin::metax + +// #endif // __SIN_METAX_OP_H__ diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..5a959b2eb --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return 
INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..b7c7d72c4 --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_CUDA_API_H__ +#define __SIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..524734a36 --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ 
+ handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define 
DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..f785f0694 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,58 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + // 将handle_转换为device::cpu::Handle类型 + auto handle = reinterpret_cast(handle_); + // 获取输出描述符的数据类型 + auto dtype = out_desc->dtype(); + + // 获取输入描述符的形状 + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + // 检查数据类型是否为INFINI_DTYPE_F16、INFINI_DTYPE_F32、INFINI_DTYPE_F64或INFINI_DTYPE_BF16 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + // 检查输出描述符的形状是否与输入描述符的形状相同 + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +// 计算函数,根据数据类型调用相应的计算函数 +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, 
inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..ba50ce423 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,20 @@ +#ifndef __TANH_CPU_H__ +#define __TANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace op::tanh::cpu { +typedef struct TanhOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return std::tanh(x); + } +} TanhOp; +} // namespace op::tanh::cpu + +#endif // __TANH_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh new file mode 100644 index 000000000..46e761012 --- /dev/null +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -0,0 +1,31 @@ +#ifndef __TANH_CUDA_H__ +#define __TANH_CUDA_H__ + +#include +// #include +// #include +// #include + +namespace op::tanh::cuda { +typedef struct TanhOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return htanh2(x); + } + // else if constexpr (std::is_same_v || std::is_same_v) { + // return htanh(x); + // } + // else if constexpr (std::is_same_v) { + // return ftanh_rd(x); + // } + else { + return std::tanh(x); + } + } +} TanhOp; +} // namespace op::tanh::cuda + +#endif // __TANH_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h 
b/src/infiniop/ops/tanh/metax/tanh_metax.h new file mode 100644 index 000000000..7e47a32a0 --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.h @@ -0,0 +1,9 @@ +#ifndef __TANH_METAX_API_H__ +#define __TANH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + + +ELEMENTWISE_DESCRIPTOR(tanh, metax) + +#endif // __TANH_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca new file mode 100644 index 000000000..8bb85adca --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca @@ -0,0 +1,64 @@ +#include "tanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + + + + + +namespace op::tanh::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, 
cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax \ No newline at end of file diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..e66d7d00d --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..701cfdb0c --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANH_CUDA_API_H__ +#define __TANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) + +#endif // __TANH_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc new file mode 100644 index 000000000..980781dc5 --- /dev/null +++ b/src/infiniop/ops/tanh/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanh_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTanhDescriptor( + infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); 
+#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanh( + infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API 
+ DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc new file mode 100644 index 000000000..fb123d316 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.cc @@ -0,0 +1,103 @@ +#include "where_cpu.h" + +namespace op::where::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + // 将handle_转换为device::cpu::Handle类型 + auto handle = reinterpret_cast(handle_); + // 获取输出描述符的数据类型 + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &condition_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &condition_shape = condition_desc->shape(); + + // CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_BOOL, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + // INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + // INFINI_DTYPE_F8, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + // INFINI_DTYPE_C128, + // INFINI_DTYPE_C16, INFINI_DTYPE_C32, INFINI_DTYPE_C64, + INFINI_DTYPE_BF16); + + CHECK_DTYPE(a_desc->dtype(), dtype); + CHECK_DTYPE(b_desc->dtype(), dtype); + CHECK_DTYPE(condition_desc->dtype(), INFINI_DTYPE_BOOL); + + // CHECK_SAME_SHAPE(output_shape, a_shape, b_shape, 
c_shape); + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, condition_shape); + + // 创建 CPU elementwise 描述符 + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +// 计算函数,根据数据类型调用相应的计算函数 +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_U8: + // return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_U16: + // return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_U32: + // return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_U64: + // return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_F8: + // return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_C128: + // return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_C16: + // return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_C32: + // return _device_info->calculate(_info, output, inputs, stream); + // case INFINI_DTYPE_C64: + // return _device_info->calculate(_info, output, inputs, stream); + 
case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +} // namespace op::where::cpu \ No newline at end of file diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h new file mode 100644 index 000000000..b18959a61 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -0,0 +1,39 @@ +#ifndef __WHERE_CPU_H__ +#define __WHERE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(where, cpu) + +namespace op::where::cpu { + +struct WhereOp { + static constexpr size_t num_inputs = 3; // a, b, condition + + // 主要的operator()函数,处理所有数据类型 + template + T operator()(const T &a_val, const T &b_val, const bool &cond) const { + return cond ? a_val : b_val; + } + + // 为Metax兼容性添加的模板operator()函数 + template + Tout operator()(const Tin&... args) const { + static_assert(sizeof...(Tin) == 3, "WhereOp expects exactly 3 arguments"); + // Metax传递的参数顺序是: [a, b, condition] + const auto& a_val = std::get<0>(std::tie(args...)); + const auto& b_val = std::get<1>(std::tie(args...)); + const bool& cond = std::get<2>(std::tie(args...)); + return cond ? a_val : b_val; + } + + // 为CPU elementwise BF16特殊处理添加的float版本 + template + T operator()(const float &a_val, const float &b_val, const bool &cond) const { + return cond ? 
a_val : b_val; + } +}; + +} // namespace op::where::cpu + +#endif // __WHERE_CPU_H__ diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh new file mode 100644 index 000000000..ad6d3b343 --- /dev/null +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -0,0 +1,43 @@ +// #ifndef __WHERE_CUDA_H__ +// #define __WHERE_CUDA_H__ + +// namespace op::where::cuda { +// typedef struct WhereOp { +// public: +// static constexpr size_t num_inputs = 3; +// template +// __device__ __forceinline__ T operator()(const bool &cond,const T &a_val, const T &b_val) const { +// return cond ? a_val : b_val; +// } +// } WhereOp; +// } // namespace op::where::cuda + +// #endif // __WHERE_CUDA_H__ +#ifndef __WHERE_CUDA_H__ +#define __WHERE_CUDA_H__ + +namespace op::where::cuda { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + + // 原有的operator()函数 + template + __device__ __forceinline__ T operator()(const bool &cond, const T &a_val, const T &b_val) const { + return cond ? a_val : b_val; + } + + // 为Metax兼容性添加的模板operator()函数 + template + __device__ __forceinline__ Tout operator()(const Tin&... args) const { + static_assert(sizeof...(Tin) == 3, "WhereOp expects exactly 3 arguments"); + const Tout& a_val = std::get<0>(std::tie(args...)); + const Tout& b_val = std::get<1>(std::tie(args...)); + const bool& cond = std::get<2>(std::tie(args...)); + return cond ? 
a_val : b_val; + + } +} WhereOp; +} // namespace op::where::cuda + +#endif // __WHERE_CUDA_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h new file mode 100644 index 000000000..253d2e337 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.h @@ -0,0 +1,9 @@ +#ifndef __WHERE_METAX_API_H__ +#define __WHERE_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + + +ELEMENTWISE_DESCRIPTOR(where, metax) + +#endif // __WHERE_METAX_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca new file mode 100644 index 000000000..be4f1a0d5 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -0,0 +1,121 @@ +#include "where_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::where::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &condition_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &condition_shape = condition_desc->shape(); + + // CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_BOOL, + INFINI_DTYPE_I8,INFINI_DTYPE_I16,INFINI_DTYPE_I32,INFINI_DTYPE_I64, + // INFINI_DTYPE_U8,INFINI_DTYPE_U16,INFINI_DTYPE_U32,INFINI_DTYPE_U64, + // INFINI_DTYPE_F8, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + // INFINI_DTYPE_C128, + // 
INFINI_DTYPE_C16,INFINI_DTYPE_C32,INFINI_DTYPE_C64, + INFINI_DTYPE_BF16); + + CHECK_DTYPE(a_desc->dtype(), dtype); + CHECK_DTYPE(b_desc->dtype(), dtype); + CHECK_DTYPE(condition_desc->dtype(), INFINI_DTYPE_BOOL); + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, condition_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool,bool,bool,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t,int8_t,int8_t,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t,int16_t,int16_t,bool>(_info,workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t,int32_t,int32_t,bool>(_info,workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t,int64_t,int64_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_U8: + // return _device_info->calculate<256, cuda::WhereOp, uint8_t,uint8_t,uint8_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_U16: + // return _device_info->calculate<256, cuda::WhereOp, int16_t,int16_t,int16_t,bool>(_info,workspace, output, inputs, stream); + // case INFINI_DTYPE_U32: + // return _device_info->calculate<256, cuda::WhereOp, int32_t,int32_t,int32_t,bool>(_info,workspace, output, inputs, stream); + // case INFINI_DTYPE_U64: + // return _device_info->calculate<256, cuda::WhereOp, int64_t,int64_t,int64_t,bool>(_info, 
workspace,output, inputs, stream); + // case INFINI_DTYPE_F8: + // return _device_info->calculate<256, cuda::WhereOp, fpos_t,fpos_t,fpos_t,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, fp16_t,fp16_t,fp16_t,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float,float,float,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double,double,double,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_C128: + // return _device_info->calculate<256, cuda::WhereOp, c8_t,c8_t,c8_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_C16: + // return _device_info->calculate<256, cuda::WhereOp, c16_t,c16_t,c16_t,bool>(_info,workspace, output, inputs, stream); + // case INFINI_DTYPE_C32: + // return _device_info->calculate<256, cuda::WhereOp, c32_t,c32_t,c32_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_C64: + // return _device_info->calculate<256, cuda::WhereOp, c64_t,c64_t,c64_t,bool>(_info,workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, bf16_t,bf16_t,bf16_t,bool>(_info,workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + // switch (_dtype) { + // case INFINI_DTYPE_F16: + // return _device_info->calculate<256, cuda::WhereOp, fp16_t, fp16_t, fp16_t, bool>(_info, workspace, output, inputs, stream); + // case INFINI_DTYPE_F32: + // return _device_info->calculate<256, cuda::WhereOp, float, float, float, bool>(_info, workspace, output, inputs, stream); + // case INFINI_DTYPE_F64: + // return _device_info->calculate<256, cuda::WhereOp, double, double, double, bool>(_info, workspace, output, inputs, stream); + // case INFINI_DTYPE_BF16: + // return _device_info->calculate<256, 
cuda::WhereOp, bf16_t, bf16_t, bf16_t, bool>(_info, workspace, output, inputs, stream); + // default: + // return INFINI_STATUS_BAD_TENSOR_DTYPE; + // } + + + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::where::metax \ No newline at end of file diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu new file mode 100644 index 000000000..b424eb3d3 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu @@ -0,0 +1,121 @@ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +#include "where_nvidia.cuh" +#include "../cuda/kernel.cuh" + + +namespace op::where::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &condition_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &condition_shape = condition_desc->shape(); + + // CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_BOOL, + INFINI_DTYPE_I8,INFINI_DTYPE_I16,INFINI_DTYPE_I32,INFINI_DTYPE_I64, + // INFINI_DTYPE_U8,INFINI_DTYPE_U16,INFINI_DTYPE_U32,INFINI_DTYPE_U64, + // INFINI_DTYPE_F8, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, + // INFINI_DTYPE_C128, + // INFINI_DTYPE_C16,INFINI_DTYPE_C32,INFINI_DTYPE_C64, + INFINI_DTYPE_BF16); + + CHECK_DTYPE(a_desc->dtype(), dtype); + CHECK_DTYPE(b_desc->dtype(), dtype); + CHECK_DTYPE(condition_desc->dtype(), INFINI_DTYPE_BOOL); + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, condition_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, 
out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool,bool,bool,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t,int8_t,int8_t,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t,int16_t,int16_t,bool>(_info,workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t,int32_t,int32_t,bool>(_info,workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t,int64_t,int64_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_U8: + // return _device_info->calculate<256, cuda::WhereOp, uint8_t,uint8_t,uint8_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_U16: + // return _device_info->calculate<256, cuda::WhereOp, int16_t,int16_t,int16_t,bool>(_info,workspace, output, inputs, stream); + // case INFINI_DTYPE_U32: + // return _device_info->calculate<256, cuda::WhereOp, int32_t,int32_t,int32_t,bool>(_info,workspace, output, inputs, stream); + // case INFINI_DTYPE_U64: + // return _device_info->calculate<256, cuda::WhereOp, int64_t,int64_t,int64_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_F8: + // return _device_info->calculate<256, cuda::WhereOp, fpos_t,fpos_t,fpos_t,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, fp16_t,fp16_t,fp16_t,bool>(_info, workspace,output, inputs, stream); + case 
INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float,float,float,bool>(_info, workspace,output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double,double,double,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_C128: + // return _device_info->calculate<256, cuda::WhereOp, c8_t,c8_t,c8_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_C16: + // return _device_info->calculate<256, cuda::WhereOp, c16_t,c16_t,c16_t,bool>(_info,workspace, output, inputs, stream); + // case INFINI_DTYPE_C32: + // return _device_info->calculate<256, cuda::WhereOp, c32_t,c32_t,c32_t,bool>(_info, workspace,output, inputs, stream); + // case INFINI_DTYPE_C64: + // return _device_info->calculate<256, cuda::WhereOp, c64_t,c64_t,c64_t,bool>(_info,workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, bf16_t,bf16_t,bf16_t,bool>(_info,workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + + // switch (_dtype) { + // case INFINI_DTYPE_F16: + // return _device_info->calculate<256, cuda::WhereOp, fp16_t>(_info, workspace, output, inputs, stream); + // case INFINI_DTYPE_F32: + // return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + // case INFINI_DTYPE_F64: + // return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + // case INFINI_DTYPE_BF16: + // return _device_info->calculate<256, cuda::WhereOp, bf16_t>(_info, workspace, output, inputs, stream); + // default: + // return INFINI_STATUS_BAD_TENSOR_DTYPE; + // } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::where::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh new file mode 100644 index 000000000..fff27ce29 --- /dev/null +++ 
b/src/infiniop/ops/where/nvidia/where_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __WHERE_CUDA_API_H__ +#define __WHERE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(where, nvidia) + +#endif // __WHERE_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc new file mode 100644 index 000000000..def43d24c --- /dev/null +++ b/src/infiniop/ops/where/operator.cc @@ -0,0 +1,148 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/where.h" + +#ifdef ENABLE_CPU_API +#include "cpu/where_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/where_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/where_metax.h" +#endif + +__C infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t condition_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc, \ + condition_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef 
ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b, condition}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/sin.py b/test/infiniop-test/test_generate/testcases/sin.py new file mode 100644 index 000000000..bfe941585 --- 
/dev/null +++ b/test/infiniop-test/test_generate/testcases/sin.py @@ -0,0 +1,97 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor + + +def sin( + input_tensor: np.ndarray, +): + return np.sin(input_tensor) + + +class SinTestCase(InfiniopTestCase): + def __init__( + self, + input_tensor: np.ndarray, + shape_input: List[int] | None, + stride_input: List[int] | None, + output: np.ndarray, + shape_output: List[int] | None, + stride_output: List[int] | None, + ): + super().__init__("sin") + self.input_tensor = input_tensor + self.shape_input = shape_input + self.stride_input = stride_input + self.output = output + self.shape_output = shape_output + self.stride_output = stride_output + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + if self.shape_input is not None: + test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_input) + if self.shape_output is not None: + test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_output) + if self.stride_input is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_input)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*self.stride_output if self.stride_output is not None else contiguous_gguf_strides(self.shape_output)) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input_tensor, raw_dtype=np_dtype_to_ggml(self.input_tensor.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + ans = sin( + self.input_tensor.astype(np.float64), + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + + +if __name__ == 
"__main__": + test_writer = InfiniopTestWriter("sin.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: + for shape, stride_input, stride_output in _TEST_CASES_: + input_tensor = np.random.rand(*shape).astype(dtype) + output = np.empty(tuple(0 for _ in shape), dtype=dtype) + input_tensor = process_zero_stride_tensor(input_tensor, stride_input) + test_case = SinTestCase( + input_tensor=input_tensor, + shape_input=shape, + stride_input=stride_input, + output=output, + shape_output=shape, + stride_output=stride_output, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/tanh.py b/test/infiniop-test/test_generate/testcases/tanh.py new file mode 100644 index 000000000..352dbb912 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/tanh.py @@ -0,0 +1,97 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor + + +def tanh( + input_tensor: np.ndarray, +): + return np.tanh(input_tensor) + + +class TanhTestCase(InfiniopTestCase): + def __init__( + self, + input_tensor: np.ndarray, + shape_input: List[int] | None, + stride_input: List[int] | None, + output: np.ndarray, + shape_output: List[int] | None, + stride_output: List[int] | None, + ): + super().__init__("tanh") + self.input_tensor = input_tensor + self.shape_input = shape_input + self.stride_input = stride_input + self.output = output + self.shape_output = shape_output + self.stride_output = stride_output + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + if self.shape_input is not None: + test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_input) + if self.shape_output is not None: + test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_output) + if self.stride_input is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_input)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*self.stride_output if self.stride_output is not None else contiguous_gguf_strides(self.shape_output)) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input_tensor, raw_dtype=np_dtype_to_ggml(self.input_tensor.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + ans = tanh( + self.input_tensor.astype(np.float64), + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("tanh.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # 
============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: + for shape, stride_input, stride_output in _TEST_CASES_: + input_tensor = np.random.rand(*shape).astype(dtype) + output = np.empty(tuple(0 for _ in shape), dtype=dtype) + input_tensor = process_zero_stride_tensor(input_tensor, stride_input) + test_case = TanhTestCase( + input_tensor=input_tensor, + shape_input=shape, + stride_input=stride_input, + output=output, + shape_output=shape, + stride_output=stride_output, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/where.py b/test/infiniop-test/test_generate/testcases/where.py new file mode 100644 index 000000000..9fe781e96 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/where.py @@ -0,0 +1,158 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor + + +def where_op( + condition: np.ndarray, + a: np.ndarray, + b: np.ndarray, +): + """NumPy reference implementation of where operation""" + return np.where(condition, a, b) + + +class WhereTestCase(InfiniopTestCase): + def __init__( + self, + condition: np.ndarray, + shape_condition: List[int] | None, + stride_condition: List[int] | None, + a: np.ndarray, + shape_a: List[int] | None, + stride_a: List[int] | None, + b: np.ndarray, + shape_b: List[int] | None, + stride_b: List[int] | None, + c: np.ndarray, + shape_c: List[int] | None, + stride_c: List[int] | None, + ): + super().__init__("where") + self.condition = condition + self.shape_condition = shape_condition + self.stride_condition = stride_condition + self.a = a + self.shape_a = shape_a + self.stride_a = stride_a + self.b = b + self.shape_b = shape_b + self.stride_b = stride_b + self.c = c + self.shape_c = shape_c + self.stride_c = stride_c + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + + # Write condition tensor info + if self.shape_condition is not None: + test_writer.add_array(test_writer.gguf_key("condition.shape"), self.shape_condition) + if self.stride_condition is not None: + test_writer.add_array(test_writer.gguf_key("condition.strides"), gguf_strides(*self.stride_condition)) + test_writer.add_tensor( + test_writer.gguf_key("condition"), self.condition, raw_dtype=np_dtype_to_ggml(self.condition.dtype) + ) + + # Write tensor a info + if self.shape_a is not None: + test_writer.add_array(test_writer.gguf_key("a.shape"), self.shape_a) + if self.stride_a is not None: + test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_tensor( + test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) + ) + + # Write tensor b info + if self.shape_b is not None: + 
test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b) + if self.stride_b is not None: + test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_tensor( + test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype) + ) + + # Write output tensor c info + if self.shape_c is not None: + test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) + test_writer.add_array( + test_writer.gguf_key("c.strides"), + gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + ) + test_writer.add_tensor( + test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype) + ) + + # Calculate expected result + ans = where_op(self.condition, self.a, self.b) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype) + ) + + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("where_op.gguf") + test_cases = [] + + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride, condition_stride + ((13, 4), None, None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None, None), + ((13, 4, 4), None, None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None, None), + ((16, 5632), None, None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), + ] + + # Support all legal data types + _TENSOR_DTYPES_ = [np.bool_, np.int8, np.int16, np.int32, np.int64, + np.float32, np.float16, np.float64, 
+ np.uint8, np.uint16, np.uint32, np.uint64, + np.bfloat16] + # _TENSOR_DTYPES_ = [np.float32, np.float16, np.float64, np.bfloat16] + + for dtype in _TENSOR_DTYPES_: + for shape, stride_a, stride_b, stride_c, stride_condition in _TEST_CASES_: + # Create condition tensor (always bool) + condition = np.random.choice([True, False], size=shape) + + # Create input tensors with specified dtype + a = np.random.rand(*shape).astype(dtype) + b = np.random.rand(*shape).astype(dtype) + c = np.empty(shape, dtype=dtype) + + # Process strides + condition = process_zero_stride_tensor(condition, stride_condition) + a = process_zero_stride_tensor(a, stride_a) + b = process_zero_stride_tensor(b, stride_b) + + test_case = WhereTestCase( + condition=condition, + shape_condition=shape, + stride_condition=stride_condition, + a=a, + shape_a=shape, + stride_a=stride_a, + b=b, + shape_b=shape, + stride_b=stride_b, + c=c, + shape_c=shape, + stride_c=stride_c, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop/cast-gguf.py b/test/infiniop/cast-gguf.py new file mode 100644 index 000000000..401f71328 --- /dev/null +++ b/test/infiniop/cast-gguf.py @@ -0,0 +1,192 @@ +import torch +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto +from gguf import GGUFReader + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +FLOAT_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] +INTER_DTYPES = [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U32, InfiniDtype.U64] + +_TENSOR_DTYPES = [ + (ftype, ttype) for ftype in FLOAT_DTYPES + 
INTER_DTYPES for ttype in FLOAT_DTYPES +] + [ + (ftype, ttype) for ftype in INTER_DTYPES for ttype in INTER_DTYPES +] + +_TOLERANCE_MAP = { + (ftype, ttype): {"atol": 1e-3, "rtol": 1e-3} + for ftype in FLOAT_DTYPES + INTER_DTYPES + for ttype in FLOAT_DTYPES +} +_TOLERANCE_MAP.update({ + (ftype, ttype): {"atol": 0, "rtol": 0} + for ftype in INTER_DTYPES + for ttype in INTER_DTYPES +}) +_TOLERANCE_MAP.update({ + (InfiniDtype.F64, InfiniDtype.F16): {"atol": 1e-3, "rtol": 1e-3}, + (InfiniDtype.I16, InfiniDtype.I32): {"atol": 1e-3, "rtol": 1e-3} +}) + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def cast(c: torch.Tensor, x: torch.Tensor): + if not x.device.type.startswith('cpu') and c.dtype in [torch.uint32, torch.uint64]: + x_np = x.cpu().numpy() + if c.dtype == torch.uint32: + c_np = x_np.astype('uint32') + elif c.dtype == torch.uint64: + c_np = x_np.astype('uint64') + c.copy_(torch.from_numpy(c_np)) + else: + c.copy_(x.to(c.dtype)) + +def load_test_cases_from_gguf(filepath): + reader = GGUFReader(filepath) + tensors = reader.tensors + + test_cases = [] + for tensor in tensors: + data = tensor.data + shape = data.shape + torch_tensor = torch.from_numpy(data.copy()) + x_stride = torch_tensor.stride() + c_stride = None + + for inplace in [Inplace.OUT_OF_PLACE, Inplace.INPLACE_X]: + test_cases.append((shape, x_stride, c_stride, inplace, torch_tensor)) + + return test_cases + +def test( + handle, + device, + shape, + x_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + torch_tensor=None, + dtype=(InfiniDtype.F32, InfiniDtype.F64), + sync=None, +): + x = TestTensor(shape, x_stride, dtype[0], device, mode="manual", set_tensor=torch_tensor) + if inplace == Inplace.INPLACE_X: + if x_stride != c_stride: + return + c = x + else: + c = TestTensor(shape, c_stride, dtype[1], device) + + print( + f"Testing Cast on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} c_stride:{c_stride} " + f"dtype from 
{InfiniDtypeNames[dtype[0]]} to {InfiniDtypeNames[dtype[1]]} inplace={inplace}" + ) + + cast(c.torch_tensor(), x.torch_tensor()) + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCastDescriptor( + handle, ctypes.byref(descriptor), c.descriptor, x.descriptor + ) + ) + + for tensor in [x, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCastWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_cast(): + check_error( + LIBINFINIOP.infiniopCast( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + x.data(), + None + ) + ) + + lib_cast() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + + actual = c.actual_tensor() + expected = c.torch_tensor() + + if DEBUG: + debug(actual, expected, atol=atol, rtol=rtol) + + if expected.dtype in [torch.float16, torch.float32, torch.float64]: + assert torch.allclose(actual, expected, atol=atol, rtol=rtol) + else: + assert torch.equal(actual, expected), f"Integer cast mismatch!\nExpected:\n{expected}\nActual:\n{actual}" + + if PROFILE: + profile_operation("PyTorch", lambda: cast(c.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lib_cast, device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyCastDescriptor(descriptor)) + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # gguf 文件路径你可以根据实际改 + _TEST_CASES = { + InfiniDtype.F16: load_test_cases_from_gguf("T1-1-1/cast/cast_f16.gguf"), + InfiniDtype.F32: load_test_cases_from_gguf("T1-1-1/cast/cast_f32.gguf"), + InfiniDtype.F64: load_test_cases_from_gguf("T1-1-1/cast/cast_float64.gguf"), + InfiniDtype.I32: load_test_cases_from_gguf("T1-1-1/cast/cast_i32.gguf"), + 
InfiniDtype.I64: load_test_cases_from_gguf("T1-1-1/cast/cast_i64.gguf"), + InfiniDtype.U32: load_test_cases_from_gguf("T1-1-1/cast/cast_u32.gguf"), + InfiniDtype.U64: load_test_cases_from_gguf("T1-1-1/cast/cast_u64.gguf"), + } + + for device in get_test_devices(args): + for ftype in _TEST_CASES: + if ftype in FLOAT_DTYPES: + test_operator(device, test, _TEST_CASES[ftype], [(f, t) for f in FLOAT_DTYPES for t in FLOAT_DTYPES]) + else: + test_operator(device, test, _TEST_CASES[ftype], [(f, t) for f in INTER_DTYPES for t in FLOAT_DTYPES+INTER_DTYPES if t not in [InfiniDtype.U64, InfiniDtype.U32] and f not in [InfiniDtype.U64, InfiniDtype.U32]]) + + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cast.py b/test/infiniop/cast.py new file mode 100644 index 000000000..fa7ec2224 --- /dev/null +++ b/test/infiniop/cast.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# cast是单输入算子,测试用例包含:形状、输入x的步长、输出c的步长 +_TEST_CASES_ = [ + # shape, x_stride, c_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() # 输出使用新内存 + INPLACE_X = auto() # 输出复用输入x的内存(原地操作) + + +FLOAT_DTYPES = [ + InfiniDtype.F16, + 
InfiniDtype.F32, + InfiniDtype.F64, +] + +INTER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +# 测试支持的数据类型, 全部类型转换为浮点类型: +# 浮点类型转换为浮点类型,整数类型转换为浮点类型 +_TENSOR_DTYPES = [ + (ftype, ttype) for ftype in FLOAT_DTYPES + INTER_DTYPES + for ttype in FLOAT_DTYPES +] + +# 整数类型之间相互转换 +_TENSOR_DTYPES.extend([ + (ftype, ttype) for ftype in INTER_DTYPES + for ttype in INTER_DTYPES +]) + +# _TENSOR_DTYPES = [ +# # 输入类型 输出类型 +# (InfiniDtype.F16, InfiniDtype.F16), +# (InfiniDtype.F32, InfiniDtype.F16), +# (InfiniDtype.F64, InfiniDtype.F16), +# (InfiniDtype.I32, InfiniDtype.F16), +# (InfiniDtype.I64, InfiniDtype.F16), +# (InfiniDtype.U32, InfiniDtype.F16), +# (InfiniDtype.U64, InfiniDtype.F16), + +# # (InfiniDtype.F16, InfiniDtype.F32), +# # (InfiniDtype.F64, InfiniDtype.F32), +# # (InfiniDtype.I32, InfiniDtype.F32), +# # (InfiniDtype.I64, InfiniDtype.F32), +# # …… +# ] + +# 不同数据类型的误差容限 +_TOLERANCE_MAP = { + (ftype, ttype): {"atol": 1e-3, "rtol": 1e-3} + for ftype in FLOAT_DTYPES + INTER_DTYPES + for ttype in FLOAT_DTYPES +} + +# 添加整数类型之间的转换及误差 +_TOLERANCE_MAP.update({ + (ftype, ttype): {"atol": 0, "rtol": 0} + for ftype in INTER_DTYPES + for ttype in INTER_DTYPES +}) + +# 特别处理 F64 浮点类型转换为 F16 浮点类型 +_TOLERANCE_MAP.update({ + (InfiniDtype.F64, InfiniDtype.F16): {"atol": 1e-3, "rtol": 1e-3} +}) + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +# 方案一:bug在于 GPU 上不支持 UInt32/UInt64 的 copy_ +# def cast(c: torch.Tensor, x: torch.Tensor): +# """ +# PyTorch参考实现Cast +# c: 输出张量 +# x: 输入张量 +# dtype: torch数据类型(如 torch.float32, torch.int32) +# """ +# # 在 CPU 上进行参考计算,确保类型和行为一致 +# # 打印x的设备信息 ['cuda:0', 'cpu'] +# if not x.device.type.startswith('cpu') and c.dtype in [torch.uint32, torch.uint64]: +# x = x.cpu().to(c.dtype) +# c.copy_(x.to(c.device)) + +# 方案二: +# 避开了 GPU UInt32/UInt64 的限制 +# 直接在 CPU 上用 NumPy 做类型转换,兼容非连续张量 +def cast(c: torch.Tensor, x: torch.Tensor): + if not x.device.type.startswith('cpu') and c.dtype in 
[torch.uint32, torch.uint64]: + x_np = x.cpu().numpy() + if c.dtype == torch.uint32: + c_np = x_np.astype('uint32') + elif c.dtype == torch.uint64: + c_np = x_np.astype('uint64') + c.copy_(torch.from_numpy(c_np)) + else: + c.copy_(x.to(c.dtype)) + + +def test( + handle, + device, + shape, + x_stride=None, + c_stride=None, + dtype=(InfiniDtype.F32, InfiniDtype.F64), + sync=None, +): + # 创建输入张量x + + x = TestTensor(shape, x_stride, dtype[0], device) + c = TestTensor(shape, c_stride, dtype[1], device) + + # 打印测试信息 + print( + f"Testing Cast on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} c_stride:{c_stride} " + f"Cast dtype from {InfiniDtypeNames[dtype[0]]} to {InfiniDtypeNames[dtype[1]]}" + ) + # 用PyTorch计算参考结果 + cast(c.torch_tensor(), x.torch_tensor()) + if sync is not None: + sync() + + # 创建cast算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCastDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, # 输出张量描述符 + x.descriptor # 输入张量描述符(单输入) + ) + ) + + # 销毁张量描述符缓存(模拟实际场景) + for tensor in [x, c]: + tensor.destroy_desc() + + # 分配工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCastWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + # 定义自定义库的cast调用函数 + def lib_cast(): + check_error( + LIBINFINIOP.infiniopCast( + descriptor, + workspace.data(), + workspace.size(), + c.data(), # 输出数据地址 + x.data(), # 输入数据地址(单输入) + None # 额外参数 + ) + ) + + # 执行自定义库的cast算子 + lib_cast() + + # 验证结果正确性 + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + actual = c.actual_tensor() + expected = c.torch_tensor() + if expected.dtype in [torch.float16, torch.float32, torch.float64]: + assert torch.allclose(actual, expected, atol=atol, rtol=rtol) + 
else: + assert torch.equal(actual, expected), \ + f"Integer cast mismatch!\nExpected:\n{expected}\nActual:\n{actual}" + + # 性能 profiling(对比自定义库与PyTorch性能) + if PROFILE: + profile_operation("PyTorch", lambda: cast(c.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cast(), device, NUM_PRERUN, NUM_ITERATIONS) + + # 销毁算子描述符 + check_error(LIBINFINIOP.infiniopDestroyCastDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # # 解析命令行参数配置测试 + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + # 在所有测试设备上执行测试 + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/cos-gguf.py b/test/infiniop/cos-gguf.py new file mode 100644 index 000000000..06360c33f --- /dev/null +++ b/test/infiniop/cos-gguf.py @@ -0,0 +1,163 @@ +import torch +import ctypes +from ctypes import c_uint64 +from gguf import GGUFReader +from enum import Enum, auto + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +# 支持的数据类型 +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# PyTorch参考实现 +def cos(output, input): + if output.shape != input.shape: + output.resize_(input.shape) + 
torch.cos(input, out=output) + +# 从 gguf 文件加载测试用例 +def load_test_cases_from_gguf(filepath): + reader = GGUFReader(filepath) + tensors = reader.tensors + + test_cases = [] + for tensor in tensors: + data = tensor.data + shape = data.shape + torch_tensor = torch.from_numpy(data.copy()) + x_stride = torch_tensor.stride() + c_stride = None + + for inplace in [Inplace.OUT_OF_PLACE, Inplace.INPLACE_X]: + test_cases.append((shape, x_stride, c_stride, inplace, torch_tensor)) + + return test_cases + +def test( + handle, + device, + shape, + x_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + torch_tensor=None, + dtype=torch.float16, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device, mode="manual", set_tensor=torch_tensor) + if inplace == Inplace.INPLACE_X: + # if x_stride != c_stride: + # return + c = x + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} " + f"c_stride:{c_stride} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + cos(c.torch_tensor(), x.torch_tensor()) + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + x.descriptor + ) + ) + + for tensor in [x, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error(LIBINFINIOP.infiniopGetCosWorkspaceSize(descriptor, ctypes.byref(workspace_size))) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_cos(): + check_error( + LIBINFINIOP.infiniopCos( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + x.data(), + None + ) + ) + + lib_cos() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + actual = c.actual_tensor() + expected = c.torch_tensor() + if DEBUG: + debug(actual, expected, atol=atol, rtol=rtol) + + assert torch.allclose(actual, 
expected, atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: cos(c.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lib_cos, device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # gguf 文件路径示例,按实际情况修改 + _TEST_CASES = { + InfiniDtype.F16: load_test_cases_from_gguf("T1-1-1/cos/cos_f16.gguf"), + InfiniDtype.F32: load_test_cases_from_gguf("T1-1-1/cos/cos_f32.gguf"), + InfiniDtype.BF16: load_test_cases_from_gguf("T1-1-1/cos/cos_bf16.gguf"), + } + + + for device in get_test_devices(args): + for dtype in _TEST_CASES: + test_operator(device, test, _TEST_CASES[dtype], [dtype]) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..37d0e1ba0 --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,191 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# cos是单输入算子,测试用例包含:形状、输入x的步长、输出c的步长 +_TEST_CASES_ = [ + # shape, x_stride, c_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), 
None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() # 输出使用新内存 + INPLACE_X = auto() # 输出复用输入x的内存(原地操作) + + +# 为每个测试用例附加inplace选项 +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# 组合测试用例:形状+步长+inplace模式 +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# 测试支持的数据类型 +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.BF16 +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos(output, input): + """PyTorch的Cos参考实现""" + if output.shape != input.shape: + output.resize_(input.shape) + torch.cos(input, out=output) + + +def test( + handle, + device, + shape, + x_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # 创建输入张量x + x = TestTensor(shape, x_stride, dtype, device) + # 根据inplace模式创建输出张量c + if inplace == Inplace.INPLACE_X: + # 原地操作:c复用x的内存(需步长匹配) + if x_stride != c_stride: + return + c = x + else: + # 非原地操作:c使用新内存 + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + # 跳过广播场景(如需支持广播可移除) + if c.is_broadcast(): + return + + # 打印测试信息 + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # 用PyTorch计算参考结果 + cos(c.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + # 创建cos算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, # 输出张量描述符 + x.descriptor # 输入张量描述符(单输入) + ) + ) + + # 销毁张量描述符缓存(模拟实际场景) + for tensor in [x, c]: + tensor.destroy_desc() + + # 分配工作空间 + workspace_size = c_uint64(0) + 
check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + # 定义自定义库的cos调用函数 + def lib_cos(): + check_error( + LIBINFINIOP.infiniopCos( + descriptor, + workspace.data(), + workspace.size(), + c.data(), # 输出数据地址 + x.data(), # 输入数据地址(单输入) + None # 额外参数 + ) + ) + + # 执行自定义库的cos算子 + lib_cos() + # 验证结果正确性 + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + + # 性能 profiling(对比自定义库与PyTorch性能) + if PROFILE: + profile_operation("PyTorch", lambda: cos(c.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + + # 销毁算子描述符 + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # # 解析命令行参数配置测试 + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + # 在所有测试设备上执行测试 + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") + \ No newline at end of file diff --git a/test/infiniop/exp-gguf.py b/test/infiniop/exp-gguf.py new file mode 100644 index 000000000..62dc00e21 --- /dev/null +++ b/test/infiniop/exp-gguf.py @@ -0,0 +1,193 @@ +import torch +import ctypes +from ctypes import c_uint64 +from gguf import GGUFReader + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# 
Configuration (Internal Use Only) +# ============================================================================== + + +class Inplace(Enum): + OUT_OF_PLACE = auto() # 输出使用新内存 + INPLACE_X = auto() # 输出复用输入x的内存(原地操作) + + +# 为每个测试用例附加inplace选项 +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +def load_test_cases_from_gguf(filepath): + """从 gguf 文件读取 tensors,生成测试用例""" + reader = GGUFReader(filepath) + tensors = reader.tensors + + test_cases = [] + for tensor in tensors: + # shape = tuple(int(s) for s in tensor.shape) + data = tensor.data # NumPy array + shape = data.shape + # 转换为 PyTorch tensor(默认 float32,后面 test() 中根据 dtype 自动转换) + torch_tensor = torch.from_numpy(data.copy()) # 必须 .copy() 防止 memory alias + x_stride = torch_tensor.stride() + c_stride = None + + for inplace_item in _INPLACE: + test_cases.append((shape, x_stride, c_stride, inplace_item, torch_tensor)) + + return test_cases + +# 不同数据类型的误差容限 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(c, x): + """用PyTorch的exp作为参考实现""" + if c.shape != x.shape: + c.resize_(0) + torch.exp(x, out=c) + + +def test( + handle, + device, + shape, + x_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + torch_tensor=None, + dtype=torch.float16, + sync=None, +): + # print(shape, x_stride, dtype, device, torch_tensor) + # 创建输入张量x + x = TestTensor(shape, x_stride, dtype, device, mode='manual',set_tensor=torch_tensor) + # 根据inplace模式创建输出张量c + if inplace == Inplace.INPLACE_X: + # 原地操作:c复用x的内存(需步长匹配) + c = x + else: + # 非原地操作:c使用新内存 + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + # 跳过广播场景(如需支持广播可移除) + if c.is_broadcast(): + return + + # 打印测试信息 + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} c_stride:{c_stride} " + 
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + # 用PyTorch计算参考结果 + exp(c.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + # 创建exp算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, # 输出张量描述符 + x.descriptor # 输入张量描述符(单输入) + ) + ) + + # 销毁张量描述符缓存(模拟实际场景) + for tensor in [x, c]: + tensor.destroy_desc() + + # 分配工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + # 定义自定义库的exp调用函数 + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace.size(), + c.data(), # 输出数据地址 + x.data(), # 输入数据地址(单输入) + None # 额外参数 + ) + ) + + # 执行自定义库的exp算子 + lib_exp() + # 验证结果正确性 + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + + # 性能 profiling(对比自定义库与PyTorch性能) + if PROFILE: + profile_operation("PyTorch", lambda: exp(c.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + + # 销毁算子描述符 + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # # 解析命令行参数配置测试 + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + _TEST_CASES = { + InfiniDtype.F16: load_test_cases_from_gguf("T1-1-1/exp/exp_bf16.gguf"), + InfiniDtype.F32: load_test_cases_from_gguf("T1-1-1/exp/exp_f32.gguf"), + InfiniDtype.BF16: load_test_cases_from_gguf("T1-1-1/exp/exp_bf16.gguf"), + } + + + # 在所有测试设备上执行测试 + for device in get_test_devices(args): + for key in _TEST_CASES: + test_operator(device, 
test, _TEST_CASES[key], [key]) + + print("\033[92mTest passed!\033[0m") + diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..7e20d8d14 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,186 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# exp是单输入算子,测试用例包含:形状、输入x的步长、输出c的步长 +_TEST_CASES_ = [ + # shape, x_stride, c_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), # 测试输入广播/重复场景 + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() # 输出使用新内存 + INPLACE_X = auto() # 输出复用输入x的内存(原地操作) + + +# 为每个测试用例附加inplace选项 +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# 组合测试用例:形状+步长+inplace模式 +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# 测试支持的数据类型 +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# 不同数据类型的误差容限 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(c, x): + """用PyTorch的exp作为参考实现""" + if c.shape != x.shape: + 
c.resize_(0) + torch.exp(x, out=c) + + +def test( + handle, + device, + shape, + x_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # 创建输入张量x + x = TestTensor(shape, x_stride, dtype, device) + # 根据inplace模式创建输出张量c + if inplace == Inplace.INPLACE_X: + # 原地操作:c复用x的内存(需步长匹配) + if x_stride != c_stride: + return + c = x + else: + # 非原地操作:c使用新内存 + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + # 跳过广播场景(如需支持广播可移除) + if c.is_broadcast(): + return + + # 打印测试信息 + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + # 用PyTorch计算参考结果 + exp(c.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + # 创建exp算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, # 输出张量描述符 + x.descriptor # 输入张量描述符(单输入) + ) + ) + + # 销毁张量描述符缓存(模拟实际场景) + for tensor in [x, c]: + tensor.destroy_desc() + + # 分配工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + # 定义自定义库的exp调用函数 + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace.size(), + c.data(), # 输出数据地址 + x.data(), # 输入数据地址(单输入) + None # 额外参数 + ) + ) + + # 执行自定义库的exp算子 + lib_exp() + # 验证结果正确性 + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + + # 性能 profiling(对比自定义库与PyTorch性能) + if PROFILE: + profile_operation("PyTorch", lambda: exp(c.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, 
NUM_PRERUN, NUM_ITERATIONS) + + # 销毁算子描述符 + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # # 解析命令行参数配置测试 + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + # 在所有测试设备上执行测试 + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") + \ No newline at end of file diff --git a/test/infiniop/hard_swish.py b/test/infiniop/hard_swish.py new file mode 100644 index 000000000..0f201d336 --- /dev/null +++ b/test/infiniop/hard_swish.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# hard_swish是单输入算子,测试用例包含:形状、输入x的步长、输出c的步长 +_TEST_CASES_ = [ + # shape, x_stride, c_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() # 输出使用新内存 + INPLACE_X = auto() # 输出复用输入x的内存(原地操作) + + +# 为每个测试用例附加inplace选项 +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# 组合测试用例:形状+步长+inplace模式 +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# 
测试支持的数据类型 +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.BF16 +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hard_swish(c, x): + """PyTorch的HardSwish参考实现""" + c.copy_(torch.nn.functional.hardswish(x)) + + +def test( + handle, + device, + shape, + x_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # 创建输入张量x + x = TestTensor(shape, x_stride, dtype, device) + # 根据inplace模式创建输出张量c + if inplace == Inplace.INPLACE_X: + # 原地操作:c复用x的内存(需步长匹配) + if x_stride != c_stride: + return + c = x + else: + # 非原地操作:c使用新内存 + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + # 跳过广播场景(如需支持广播可移除) + if c.is_broadcast(): + return + + # 打印测试信息 + print( + f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # 用PyTorch计算参考结果 + hard_swish(c.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + # 创建hard_swish算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardSwishDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, # 输出张量描述符 + x.descriptor # 输入张量描述符(单输入) + ) + ) + + # 销毁张量描述符缓存(模拟实际场景) + for tensor in [x, c]: + tensor.destroy_desc() + + # 分配工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardSwishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + # 定义自定义库的hard_swish调用函数 + def lib_hard_swish(): + check_error( + LIBINFINIOP.infiniopHardSwish( + descriptor, + workspace.data(), + workspace.size(), + c.data(), # 输出数据地址 + x.data(), # 输入数据地址(单输入) + None # 额外参数 + ) + ) + + # 执行自定义库的hard_swish算子 + 
lib_hard_swish() + # 验证结果正确性 + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + + # 性能 profiling(对比自定义库与PyTorch性能) + if PROFILE: + profile_operation("PyTorch", lambda: hard_swish(c.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hard_swish(), device, NUM_PRERUN, NUM_ITERATIONS) + + # 销毁算子描述符 + check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # # 解析命令行参数配置测试 + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + # 在所有测试设备上执行测试 + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") + \ No newline at end of file diff --git a/test/infiniop/leaky_relu.py b/test/infiniop/leaky_relu.py new file mode 100644 index 000000000..8dec3bf9d --- /dev/null +++ b/test/infiniop/leaky_relu.py @@ -0,0 +1,220 @@ +import torch +import ctypes +from ctypes import c_uint64, c_float +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration for Leaky ReLU Testing +# ============================================================================== +# Test cases: shape, input_stride, output_stride, negative_slope +_TEST_CASES_ = [ + # shape, input_stride, output_stride, negative_slope + ((16,), None, None, 0.01), + ((13, 4), None, None, 0.02), + ((13, 4), (10, 1), (10, 1), 0.03), + ((13, 4), (0, 1), None, 
0.04), + ((13, 4, 4), None, None, 0.01), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), 0.01), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), 0.01), + ((16, 5632), None, None, 0.01), + ((16, 5632), (13312, 1), (13312, 1), 0.01), + ((4, 4, 5632), None, None, 0.01), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), 0.01), + ((4,), None, None, 0.01), + ((10,), (1,), (1,), 0.5), + ((3, 3, 3), None, None, 0.0), + ((2, 4), None, None, 1.0), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + # Metax下InfiniDtype.F16会报错(CPU的可以正常测试), 暂时注释掉 + # InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.BF16 +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = True +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def leaky_relu(output, input, negative_slope): + output.copy_(torch.nn.functional.leaky_relu(input, negative_slope, inplace=False)) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + negative_slope=0.01, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # Create input tensor + input_tensor = TestTensor(shape, input_stride, dtype, device) + # Handle in-place vs out-of-place + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return # Skip incompatible strides + output_tensor = input_tensor + else: + output_tensor = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output_tensor.is_broadcast(): + return # Skip broadcasted 
outputs + + print( + f"Testing LeakyReLU on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} output_stride:{output_stride} " + f"negative_slope:{negative_slope} dtype:{InfiniDtypeNames[dtype]} " + f"slope:{negative_slope} inplace:{inplace}" + ) + + # Compute reference result using PyTorch + leaky_relu( + output_tensor.torch_tensor(), + input_tensor.torch_tensor(), + negative_slope + ) + + + if sync is not None: + sync() + + # Create LeakyReLU descriptor + descriptor = infiniopOperatorDescriptor_t() + + check_error( + LIBINFINIOP.infiniopCreateLeakyReluDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor + ) + ) + # Invalidate the shape and strides in the descriptor + input_tensor.destroy_desc() + if inplace == Inplace.OUT_OF_PLACE: + output_tensor.destroy_desc() + + # Get workspace size and allocate + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLeakyReluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + # Define function to call the library implementation + def lib_leaky_relu(): + check_error( + LIBINFINIOP.infiniopLeakyRelu( + descriptor, + workspace.data(), + workspace.size(), + output_tensor.data(), + input_tensor.data(), + negative_slope, + None, + ) + ) + # Run the library implementation + lib_leaky_relu() + + # Verify results + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output_tensor.actual_tensor(), output_tensor.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol + ) + + # Profiling + if PROFILE: + profile_operation( + "PyTorch", + lambda: leaky_relu( + output_tensor.torch_tensor(), + input_tensor.torch_tensor(), + negative_slope + ), + device, + NUM_PRERUN, + NUM_ITERATIONS + ) + profile_operation( + " lib", + lambda: lib_leaky_relu(), + device, + 
NUM_PRERUN, + NUM_ITERATIONS + ) + # Clean up + check_error(LIBINFINIOP.infiniopDestroyLeakyReluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mLeakyReLU test passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..30d186c31 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -96,6 +96,37 @@ def attention_(lib): ] +@OpRegister.operator +def cast_(lib): + lib.infiniopCreateCastDescriptor.restype = c_int32 + lib.infiniopCreateCastDescriptor.argtypes = [ + infiniopHandle_t, # 句柄 + POINTER(infiniopOperatorDescriptor_t), # 输出:算子描述符指针 + infiniopTensorDescriptor_t, # 输出张量描述符 + infiniopTensorDescriptor_t # 输入张量描述符 + ] + + lib.infiniopGetCastWorkspaceSize.restype = c_int32 + lib.infiniopGetCastWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + POINTER(c_size_t) # 输出:工作空间大小指针 + ] + + lib.infiniopCast.restype = c_int32 + lib.infiniopCast.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + c_void_p, # 工作空间地址 + c_size_t, # 工作空间大小 + c_void_p, # 输出数据地址 + c_void_p, # 输入数据地址 + c_void_p # 额外参数(通常为nullptr) + ] + + lib.infiniopDestroyCastDescriptor.restype = c_int32 + lib.infiniopDestroyCastDescriptor.argtypes = [ + infiniopOperatorDescriptor_t # 算子描述符 + ] + @OpRegister.operator def causal_softmax_(lib): lib.infiniopCreateCausalSoftmaxDescriptor.restype = c_int32 @@ -162,10 +193,80 @@ def clip_(lib): ] +# @OpRegister.operator +# def conv_(lib): +# pass + + @OpRegister.operator -def conv_(lib): - pass +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + 
infiniopHandle_t, # 句柄 + POINTER(infiniopOperatorDescriptor_t), # 输出:算子描述符指针 + infiniopTensorDescriptor_t, # 输出张量描述符 + infiniopTensorDescriptor_t # 输入张量描述符(cos为单输入) + ] + + # 2. 获取cos算子工作空间大小的函数 + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + POINTER(c_size_t) # 输出:工作空间大小指针 + ] + + # 3. 执行cos算子的函数 + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + c_void_p, # 工作空间地址 + c_size_t, # 工作空间大小 + c_void_p, # 输出数据地址 + c_void_p, # 输入数据地址(cos为单输入) + c_void_p # 额外参数(通常为nullptr) + ] + + # 4. 销毁cos算子描述符的函数 + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t # 算子描述符 + ] + + +@OpRegister.operator +def exp_(lib): + # 1. 创建exp算子描述符的函数 + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, # 句柄 + POINTER(infiniopOperatorDescriptor_t), # 输出:算子描述符指针 + infiniopTensorDescriptor_t, # 输出张量描述符 + infiniopTensorDescriptor_t # 输入张量描述符(exp为单输入) + ] + + # 2. 获取exp算子工作空间大小的函数 + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + POINTER(c_size_t) # 输出:工作空间大小指针 + ] + # 3. 执行exp算子的函数 + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + c_void_p, # 工作空间地址 + c_size_t, # 工作空间大小 + c_void_p, # 输出数据地址 + c_void_p, # 输入数据地址(exp为单输入) + c_void_p # 额外参数(通常为nullptr) + ] + + # 4. 销毁exp算子描述符的函数 + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t # 算子描述符 + ] @OpRegister.operator def gemm_(lib): @@ -203,6 +304,79 @@ def gemm_(lib): ] + +@OpRegister.operator +def hard_swish_(lib): + # 1. 
创建hard_swish算子描述符的函数 + lib.infiniopCreateHardSwishDescriptor.restype = c_int32 + lib.infiniopCreateHardSwishDescriptor.argtypes = [ + infiniopHandle_t, # 句柄 + POINTER(infiniopOperatorDescriptor_t), # 输出:算子描述符指针 + infiniopTensorDescriptor_t, # 输出张量描述符 + infiniopTensorDescriptor_t # 输入张量描述符(hard_swish为单输入) + ] + + # 2. 获取hard_swish算子工作空间大小的函数 + lib.infiniopGetHardSwishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardSwishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + POINTER(c_size_t) # 输出:工作空间大小指针 + ] + + # 3. 执行hard_swish算子的函数 + lib.infiniopHardSwish.restype = c_int32 + lib.infiniopHardSwish.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + c_void_p, # 工作空间地址 + c_size_t, # 工作空间大小 + c_void_p, # 输出数据地址 + c_void_p, # 输入数据地址(hard_swish为单输入) + c_void_p # 额外参数(通常为nullptr) + ] + + # 4. 销毁hard_swish算子描述符的函数 + lib.infiniopDestroyHardSwishDescriptor.restype = c_int32 + lib.infiniopDestroyHardSwishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t # 算子描述符 + ] + +@OpRegister.operator +def leaky_relu_(lib): + # 1. 创建 LeakyReLU 算子描述符的函数 + lib.infiniopCreateLeakyReluDescriptor.restype = c_int32 + lib.infiniopCreateLeakyReluDescriptor.argtypes = [ + infiniopHandle_t, # 句柄 + POINTER(infiniopOperatorDescriptor_t), # 输出:算子描述符指针 + infiniopTensorDescriptor_t, # 输出张量描述符 + infiniopTensorDescriptor_t, # 输入张量描述符 + ] + + # 2. 获取 LeakyReLU 算子工作空间大小的函数 + lib.infiniopGetLeakyReluWorkspaceSize.restype = c_int32 + lib.infiniopGetLeakyReluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + POINTER(c_size_t) # 输出:工作空间大小指针 + ] + + # 3. 执行 LeakyReLU 算子的函数 + lib.infiniopLeakyRelu.restype = c_int32 + lib.infiniopLeakyRelu.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + c_void_p, # 工作空间地址 + c_size_t, # 工作空间大小 + c_void_p, # 输出数据地址 + c_void_p, # 输入数据地址 + c_float, # negative_slope 参数 + c_void_p # 额外参数(通常为 nullptr) + ] + + # 4. 
销毁 LeakyReLU 算子描述符的函数 + lib.infiniopDestroyLeakyReluDescriptor.restype = c_int32 + lib.infiniopDestroyLeakyReluDescriptor.argtypes = [ + infiniopOperatorDescriptor_t # 算子描述符 + ] + + @OpRegister.operator def mul_(lib): lib.infiniopCreateMulDescriptor.restype = c_int32 @@ -387,6 +561,42 @@ def rope_(lib): ] + +@OpRegister.operator +def sigmoid_backward_(lib): + lib.infiniopCreateSigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidBackwardDescriptor.argtypes = [ + infiniopHandle_t, # 句柄 + POINTER(infiniopOperatorDescriptor_t), # 输出:算子描述符指针 + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t + ] + + lib.infiniopGetSigmoidBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetSigmoidBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + POINTER(c_size_t) # 输出:工作空间大小指针 + ] + + lib.infiniopSigmoidBackward.restype = c_int32 + lib.infiniopSigmoidBackward.argtypes = [ + infiniopOperatorDescriptor_t, # 算子描述符 + c_void_p, # 工作空间地址 + c_size_t, # 工作空间大小 + c_void_p, # 输出数据地址 + c_void_p, # 输入数据地址 + c_void_p, # 输入数据地址 + c_void_p # 额外参数(通常为nullptr) + ] + + lib.infiniopDestroySigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t # 算子描述符 + ] + + + @OpRegister.operator def sub_(lib): lib.infiniopCreateSubDescriptor.restype = c_int32 @@ -489,3 +699,99 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output descriptor + infiniopTensorDescriptor_t, # input descriptor + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = 
c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, # output data + c_void_p, # input data + c_void_p, # stream or reserved + ] + + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output descriptor + infiniopTensorDescriptor_t, # input descriptor + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, # output data + c_void_p, # input data + c_void_p, # stream or reserved + ] + + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] \ No newline at end of file 
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 5c8e7f80a..135720bc2 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,53 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + + torch_dtype = to_torch_dtype(dt) if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if torch_dtype == torch.bool: + # 对于布尔类型,先生成0和1的整数张量,再转换为布尔类型 + self._torch_tensor = torch.randint(0, 2, torch_shape, + dtype=torch.int32, + device=torch_device_map[device]) + self._torch_tensor = self._torch_tensor.to(torch.bool) + elif torch_dtype in (torch.uint8, torch.uint16, torch.uint32, torch.uint64, torch.int8, torch.int16, torch.int32, torch.int64): + # 对于整数类型,使用randint + dtype_info = torch.iinfo(torch_dtype) + + # 计算一个安全的范围,避免溢出 + if dtype_info.min < 0: + # 有符号整数 + safe_range = min(10000, dtype_info.max // 2) + low = -safe_range + high = safe_range + 1 + else: + # 无符号整数 + if torch_dtype == torch.uint8: + # 对于8位无符号整数,可以使用全范围 + low = 0 + high = dtype_info.max + 1 + elif torch_dtype == torch.uint16: + # 对于16位无符号整数,使用较大的范围 + low = 0 + high = min(65536, dtype_info.max + 1) + else: + # 对于32位和64位无符号整数,使用较小的范围以避免溢出 + safe_range = min(10000, dtype_info.max // 2) + low = 0 + high = safe_range + 1 + + self._torch_tensor = torch.randint( + low=low, + high=high, + size=torch_shape, + dtype=torch_dtype, + device=torch_device_map[device] + ) + else: + # 对于浮点类型,使用rand + self._torch_tensor = torch.rand( + torch_shape, dtype=torch_dtype, device=torch_device_map[device] + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -87,14 +130,16 @@ def __init__( ) else: raise ValueError("Unsupported mode") - if scale is not None: self._torch_tensor *= scale if bias is not None: self._torch_tensor += bias if strides is not None: - 
self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides) + if not is_signed_integer_dtype(dt) and is_integer_dtype(dt): + self._data_tensor = rearrange_tensor_extend_uint(self._torch_tensor, torch_strides) + else: + self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides) else: self._data_tensor = self._torch_tensor.clone() @@ -121,6 +166,13 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum): ) +def is_integer_dtype(dt): + return dt in [InfiniDtype.I32, InfiniDtype.I64, InfiniDtype.U32, InfiniDtype.U64] + +def is_signed_integer_dtype(dt): + return dt in [InfiniDtype.I32, InfiniDtype.I64] + + def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): if dt == InfiniDtype.I8: return torch.int8 @@ -148,6 +200,8 @@ def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): return torch.int32 if compatability_mode else torch.uint32 elif dt == InfiniDtype.U64: return torch.int64 if compatability_mode else torch.uint64 + elif dt == InfiniDtype.BOOL: + return torch.bool else: raise ValueError("Unsupported data type") @@ -226,6 +280,78 @@ def rearrange_tensor(tensor, new_strides): return new_tensor + +def rearrange_tensor_extend_uint(tensor, new_strides): + """ + Rearranges the given tensor to have new strides by copying data into a new memory layout. + Supports all dtypes including torch.uint32. 
+ """ + import torch + + shape = tensor.shape + ndim = len(shape) + + # 计算新张量所需的 flat size(内存空间大小) + left = 0 + right = 0 + for i in range(ndim): + if new_strides[i] > 0: + right += new_strides[i] * (shape[i] - 1) + else: + raise ValueError("Negative strides are not supported yet") + + # 新张量的 flat size(包含所有偏移的最大位置 +1) + flat_size = right + 1 + + # 创建一个扁平的新张量 + new_tensor = torch.zeros(flat_size, dtype=tensor.dtype, device=tensor.device) + + # 创建所有维度的坐标网格 + indices = [torch.arange(s, device=tensor.device) for s in shape] + mesh = torch.meshgrid(*indices, indexing="ij") + coords = [m.flatten() for m in mesh] # 每个维度展平成一维 + new_positions += offset + # 计算新位置的线性索引 offset = i0 * s0 + i1 * s1 + ... + # new_positions = sum(coords[i] * new_strides[i] for i in range(ndim)) + + # # 从原始张量中提取数据并写入新内存布局(避免 index_add_) + # flat_src = tensor.contiguous().view(-1) + # flat_dst = new_tensor.view(-1) + # for i in range(new_positions.numel()): + # flat_dst[new_positions[i]] = flat_src[i] + + # # 用新布局构造最终张量 + # new_tensor = new_tensor.as_strided(shape, new_strides) + unsupported_types = (torch.uint16, torch.uint32, torch.uint64) + + if tensor.dtype in unsupported_types: + # For unsupported types, convert to a compatible type, perform the operation, then convert back + # Determine the compatible type + if tensor.dtype == torch.uint16: + compatible_dtype = torch.int16 + elif tensor.dtype == torch.uint32: + compatible_dtype = torch.int32 + else: # torch.uint64 + compatible_dtype = torch.int64 + + # Convert tensors to compatible type + tensor_converted = tensor.to(compatible_dtype) + new_tensor_converted = new_tensor.to(compatible_dtype) + + # Perform the index_add_ operation + new_tensor_converted.view(-1).index_add_(0, new_positions, tensor_converted.view(-1)) + + # Convert the result back to the original dtype + new_tensor.copy_(new_tensor_converted.to(tensor.dtype)) + else: + # For supported types, use index_add_ directly + new_tensor.view(-1).index_add_(0, new_positions, 
tensor.view(-1)) + + new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides)) + + return new_tensor + + def get_args(): import argparse diff --git a/test/infiniop/sigmoid_backward.py b/test/infiniop/sigmoid_backward.py new file mode 100644 index 000000000..65e3d0316 --- /dev/null +++ b/test/infiniop/sigmoid_backward.py @@ -0,0 +1,174 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration +# ============================================================================== +# sigmoid_backward 是双输入算子:grad_output 和 input +# shape,grad_stride=None,x_stride=None,c_stride=None, +_TEST_CASES_ = [ + ((13, 4, 4), None, None, None), + ((13, 4, 4), (16, 4, 1), (16, 4, 1), (16, 4, 1)), + ((13, 4, 4), (4, 0, 1), (4, 0, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.BF16 +] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sigmoid_backward_ref(input, grad_output): + 
input.requires_grad_(True) + output = torch.nn.functional.sigmoid(input) + output.backward(grad_output) + return input.grad + + + +def test( + handle, + device, + shape, + grad_stride=None, + x_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + # 输入 + grad_output = TestTensor(shape, grad_stride, dtype, device) + input_x = TestTensor(shape, x_stride, dtype, device) + + if inplace == Inplace.INPLACE_X: + if c_stride != grad_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + return + + print( + f"Testing sigmoid_backward on {InfiniDeviceNames[device]} " + f"shape:{shape} grad_stride:{grad_stride} x_stride:{x_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # PyTorch参考输出 + grad_input_torch = sigmoid_backward_ref(input_x.torch_tensor(), grad_output.torch_tensor()) + grad_input.torch_tensor().copy_(grad_input_torch) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, # 输出 grad_input + grad_output.descriptor, # 输入 grad_output + input_x.descriptor # 输入 x + ) + ) + + for tensor in [grad_output, input_x, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_sigmoid_backward(): + check_error( + LIBINFINIOP.infiniopSigmoidBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + grad_output.data(), + input_x.data(), + None + ) + ) + + lib_sigmoid_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(grad_input.actual_tensor(), 
grad_input.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: sigmoid_backward_ref(input_x.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sigmoid_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroySigmoidBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sin-gguf.py b/test/infiniop/sin-gguf.py new file mode 100644 index 000000000..bfa74bc5d --- /dev/null +++ b/test/infiniop/sin-gguf.py @@ -0,0 +1,163 @@ +import torch +import ctypes +from ctypes import c_uint64 +from gguf import GGUFReader +from enum import Enum, auto + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +# 支持的数据类型 +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# PyTorch参考实现 +def sin(output, input): + if output.shape != input.shape: + output.resize_(input.shape) + torch.sin(input, out=output) + +# 从 gguf 文件加载测试用例 +def load_test_cases_from_gguf(filepath): + reader = GGUFReader(filepath) + tensors 
= reader.tensors + + test_cases = [] + for tensor in tensors: + data = tensor.data + shape = data.shape + torch_tensor = torch.from_numpy(data.copy()) + x_stride = torch_tensor.stride() + c_stride = None + + for inplace in [Inplace.OUT_OF_PLACE, Inplace.INPLACE_X]: + test_cases.append((shape, x_stride, c_stride, inplace, torch_tensor)) + + return test_cases + +def test( + handle, + device, + shape, + x_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + torch_tensor=None, + dtype=torch.float16, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device, mode="manual", set_tensor=torch_tensor) + if inplace == Inplace.INPLACE_X: + # if x_stride != c_stride: + # return + c = x + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing sin on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} " + f"c_stride:{c_stride} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sin(c.torch_tensor(), x.torch_tensor()) + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + x.descriptor + ) + ) + + for tensor in [x, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error(LIBINFINIOP.infiniopGetSinWorkspaceSize(descriptor, ctypes.byref(workspace_size))) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_sin(): + check_error( + LIBINFINIOP.infiniopSin( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + x.data(), + None + ) + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + actual = c.actual_tensor() + expected = c.torch_tensor() + if DEBUG: + debug(actual, expected, atol=atol, rtol=rtol) + + assert torch.allclose(actual, expected, atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: sin(c.torch_tensor(), x.torch_tensor()), device, 
NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lib_sin, device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # gguf 文件路径示例,按实际情况修改 + _TEST_CASES = { + InfiniDtype.F16: load_test_cases_from_gguf("T1-1-1/sin/sin_bf16.gguf"), + InfiniDtype.F32: load_test_cases_from_gguf("T1-1-1/sin/sin_f32.gguf"), + InfiniDtype.BF16: load_test_cases_from_gguf("T1-1-1/sin/sin_bf16.gguf"), + } + + + for device in get_test_devices(args): + for dtype in _TEST_CASES: + test_operator(device, test, _TEST_CASES[dtype], [dtype]) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..ec447cc46 --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,177 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + 
OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F64: {"atol": 1e-11, "rtol": 1e-11}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(output, input): + if output.shape != input.shape: + # 如果形状不匹配,先调整输出张量的形状 + output.resize_(input.shape) + torch.sin(input, out=output) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input_tensor = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input_tensor + else: + # 确保输出张量的形状与输入张量的形状匹配 + output_shape = shape if output_stride is None else shape + output = TestTensor(output_shape, output_stride, dtype, device, mode="ones") + if output.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sin(output.torch_tensor(), input_tensor.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input_tensor.descriptor, + ) + ) + + # 
Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_sin(): + check_error( + LIBINFINIOP.infiniopSin( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input_tensor.data(), + None, + ) + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(output.torch_tensor(), input_tensor.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..a08bfd9be --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,177 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + 
infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +# 定义一个枚举类,用于表示是否进行原地操作 +class Inplace(Enum): + # 不进行原地操作 + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(output, input): + if output.shape != input.shape: + # 如果形状不匹配,先调整输出张量的形状 + output.resize_(input.shape) + torch.tanh(input, out=output) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input_tensor = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + 
output = input_tensor + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + tanh(output.torch_tensor(), input_tensor.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_tanh(): + check_error( + LIBINFINIOP.infiniopTanh( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input_tensor.data(), + None, + ) + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input_tensor.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for 
device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..398d085c9 --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,261 @@ +import torch +import ctypes +from ctypes import c_uint64 +from ctypes import c_uint8 +from ctypes import c_uint16 +from ctypes import c_uint32 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride, condition_stride + ((13, 4), None, None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None, None), + ((13, 4, 4), None, None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None, None), + ((16, 5632), None, None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + 
(inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing - support all legal types +_TENSOR_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + # InfiniDtype.U8, + # InfiniDtype.U16, + # InfiniDtype.U32, + # InfiniDtype.U64 + # InfiniDtype.F8 +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + # InfiniDtype.U8: {"atol": 0, "rtol": 0}, + # InfiniDtype.U16: {"atol": 0, "rtol": 0}, + # InfiniDtype.U32: {"atol": 0, "rtol": 0}, + # InfiniDtype.U64: {"atol": 0, "rtol": 0} + # InfiniDtype.F8: {"atol": 1e-3, "rtol": 1e-3} +} + +DEBUG = True +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def where_op(c, a, b, condition): + """PyTorch reference implementation of where operation""" + torch.where(condition, a, b, out=c) +# def where_op(c, a, b, condition): +# """PyTorch reference implementation of where operation""" +# # 检查数据类型并进行必要的转换 +# unsupported_types = (torch.uint16, torch.uint32, torch.uint64) + +# if a.dtype in unsupported_types or b.dtype in unsupported_types or c.dtype in unsupported_types: +# # 将不支持的类型转换为对应的兼容类型 +# def get_compatible_dtype(dtype): +# if dtype == torch.uint16: +# return torch.int16 +# elif dtype == torch.uint32: +# return torch.int32 +# elif dtype == torch.uint64: +# return torch.int64 +# else: +# return dtype + +# a_converted = a.to(get_compatible_dtype(a.dtype)) if a.dtype in unsupported_types else a +# b_converted = 
b.to(get_compatible_dtype(b.dtype)) if b.dtype in unsupported_types else b +# c_converted = c.to(get_compatible_dtype(c.dtype)) if c.dtype in unsupported_types else c + +# # 调用torch.where +# torch.where(condition, a_converted, b_converted, out=c_converted) + +# # 如果需要,将结果转换回原始类型 +# if c.dtype in unsupported_types: +# c.copy_(c_converted.to(c.dtype)) +# else: +# # 对于其他支持的类型,直接调用torch.where +# torch.where(condition, a, b, out=c) + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + condition_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float32, + sync=None, +): + # Create tensors with specified data type + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + # Create condition tensor (always bool type) + condition = TestTensor(shape, condition_stride, InfiniDtype.BOOL, device) + + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} with shape:{shape} " + f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"condition_stride:{condition_stride} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Execute PyTorch reference implementation + where_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor(), condition.torch_tensor()) + + if sync is not None: + sync() + + # Create operator descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + condition.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c, condition]: + tensor.destroy_desc() 
+ + # Get workspace size + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + + + def libwhere(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + condition.data(), + None, + ) + ) + + libwhere() + + # Sync the torch_tensor with actual_tensor after Infiniop operation + # Copy data from actual_tensor to torch_tensor to ensure consistency + c.torch_tensor().copy_(c.actual_tensor()) + + # Verify results + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: where_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor(), condition.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: libwhere(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test_report/image-1.png b/test_report/image-1.png new file mode 100644 index 000000000..5abfbb56a Binary files /dev/null and b/test_report/image-1.png differ diff --git a/test_report/image-10.png b/test_report/image-10.png new file mode 100644 index 000000000..81192b080 Binary files /dev/null and b/test_report/image-10.png differ diff --git a/test_report/image-11.png b/test_report/image-11.png new 
file mode 100644 index 000000000..6596844cf Binary files /dev/null and b/test_report/image-11.png differ diff --git a/test_report/image-12.png b/test_report/image-12.png new file mode 100644 index 000000000..aebaf49c2 Binary files /dev/null and b/test_report/image-12.png differ diff --git a/test_report/image-13.png b/test_report/image-13.png new file mode 100644 index 000000000..de3094a0a Binary files /dev/null and b/test_report/image-13.png differ diff --git a/test_report/image-14.png b/test_report/image-14.png new file mode 100644 index 000000000..3da27acb0 Binary files /dev/null and b/test_report/image-14.png differ diff --git a/test_report/image-15.png b/test_report/image-15.png new file mode 100644 index 000000000..8b7daf6a9 Binary files /dev/null and b/test_report/image-15.png differ diff --git a/test_report/image-16.png b/test_report/image-16.png new file mode 100644 index 000000000..5208a0cce Binary files /dev/null and b/test_report/image-16.png differ diff --git a/test_report/image-17.png b/test_report/image-17.png new file mode 100644 index 000000000..3a79328b8 Binary files /dev/null and b/test_report/image-17.png differ diff --git a/test_report/image-18.png b/test_report/image-18.png new file mode 100644 index 000000000..3aa857fad Binary files /dev/null and b/test_report/image-18.png differ diff --git a/test_report/image-19.png b/test_report/image-19.png new file mode 100644 index 000000000..e2edccfc0 Binary files /dev/null and b/test_report/image-19.png differ diff --git a/test_report/image-2.png b/test_report/image-2.png new file mode 100644 index 000000000..1334e327e Binary files /dev/null and b/test_report/image-2.png differ diff --git a/test_report/image-20.png b/test_report/image-20.png new file mode 100644 index 000000000..dd4539187 Binary files /dev/null and b/test_report/image-20.png differ diff --git a/test_report/image-21.png b/test_report/image-21.png new file mode 100644 index 000000000..3689ec3d2 Binary files /dev/null and 
b/test_report/image-21.png differ diff --git a/test_report/image-22.png b/test_report/image-22.png new file mode 100644 index 000000000..de40c1f56 Binary files /dev/null and b/test_report/image-22.png differ diff --git a/test_report/image-23.png b/test_report/image-23.png new file mode 100644 index 000000000..6190f8485 Binary files /dev/null and b/test_report/image-23.png differ diff --git a/test_report/image-3.png b/test_report/image-3.png new file mode 100644 index 000000000..20ca6f20a Binary files /dev/null and b/test_report/image-3.png differ diff --git a/test_report/image-4.png b/test_report/image-4.png new file mode 100644 index 000000000..82191f76f Binary files /dev/null and b/test_report/image-4.png differ diff --git a/test_report/image-5.png b/test_report/image-5.png new file mode 100644 index 000000000..acf44f5f0 Binary files /dev/null and b/test_report/image-5.png differ diff --git a/test_report/image-6.png b/test_report/image-6.png new file mode 100644 index 000000000..88c7b5240 Binary files /dev/null and b/test_report/image-6.png differ diff --git a/test_report/image-7.png b/test_report/image-7.png new file mode 100644 index 000000000..c46b8c080 Binary files /dev/null and b/test_report/image-7.png differ diff --git a/test_report/image-8.png b/test_report/image-8.png new file mode 100644 index 000000000..4e9052000 Binary files /dev/null and b/test_report/image-8.png differ diff --git a/test_report/image-9.png b/test_report/image-9.png new file mode 100644 index 000000000..735fbb398 Binary files /dev/null and b/test_report/image-9.png differ diff --git a/test_report/image.png b/test_report/image.png new file mode 100644 index 000000000..5dbd8e962 Binary files /dev/null and b/test_report/image.png differ diff --git a/test_report/sin-cpu-1.png b/test_report/sin-cpu-1.png new file mode 100644 index 000000000..245eb802a Binary files /dev/null and b/test_report/sin-cpu-1.png differ diff --git a/test_report/sin-cpu-2.png b/test_report/sin-cpu-2.png new file 
mode 100644 index 000000000..23aae89c7 Binary files /dev/null and b/test_report/sin-cpu-2.png differ diff --git a/test_report/sin-metax-1.png b/test_report/sin-metax-1.png new file mode 100644 index 000000000..7ce558d5a Binary files /dev/null and b/test_report/sin-metax-1.png differ diff --git a/test_report/sin-metax-2.png b/test_report/sin-metax-2.png new file mode 100644 index 000000000..5a2ac409d Binary files /dev/null and b/test_report/sin-metax-2.png differ diff --git a/test_report/tanh-cpu-1.png b/test_report/tanh-cpu-1.png new file mode 100644 index 000000000..997d7fb6d Binary files /dev/null and b/test_report/tanh-cpu-1.png differ diff --git a/test_report/tanh-cpu-2.png b/test_report/tanh-cpu-2.png new file mode 100644 index 000000000..b79b15a5e Binary files /dev/null and b/test_report/tanh-cpu-2.png differ diff --git a/test_report/tanh-metax-1.png b/test_report/tanh-metax-1.png new file mode 100644 index 000000000..53bc08017 Binary files /dev/null and b/test_report/tanh-metax-1.png differ diff --git a/test_report/tanh-metax-2.png b/test_report/tanh-metax-2.png new file mode 100644 index 000000000..fc90aff12 Binary files /dev/null and b/test_report/tanh-metax-2.png differ diff --git a/test_report/where-cpu-1.png b/test_report/where-cpu-1.png new file mode 100644 index 000000000..35d5a30f3 Binary files /dev/null and b/test_report/where-cpu-1.png differ diff --git a/test_report/where-cpu-2.png b/test_report/where-cpu-2.png new file mode 100644 index 000000000..f01ac6788 Binary files /dev/null and b/test_report/where-cpu-2.png differ diff --git a/test_report/where-metax-1.png b/test_report/where-metax-1.png new file mode 100644 index 000000000..efbf91fdc Binary files /dev/null and b/test_report/where-metax-1.png differ diff --git a/test_report/where-metax-2.png b/test_report/where-metax-2.png new file mode 100644 index 000000000..377636197 Binary files /dev/null and b/test_report/where-metax-2.png differ diff --git 
"a/test_report/\346\265\213\350\257\225\346\212\245\345\221\212.md" "b/test_report/\346\265\213\350\257\225\346\212\245\345\221\212.md" new file mode 100644 index 000000000..ac3e4afb0 --- /dev/null +++ "b/test_report/\346\265\213\350\257\225\346\212\245\345\221\212.md" @@ -0,0 +1,161 @@ +完成了算子赛题 T1-1-1 : +- Exp +- Sin +- Cos +- LeakyRelu +- Tanh +- Sigmoid Backward +- HardSwish +- Cast +- Where + +在内的所有9个算子的实现,以及 T1-1-2的: + +…… + +并通过了CPU、Metax平台中Pytorch单元测试代码的验证。 + + +# PyTorch单元测试 +## 1. Exp算子测试 + +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持数据类型涵盖了f32, f16, bf16 + + +### CPU平台测试结果 +![alt text](image.png) +…… +![alt text](image-1.png) + +### Metax平台测试结果 +![alt text](image-2.png) +…… +![alt text](image-3.png) + +## 2. Sin算子测试 +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持数据类型涵盖了f32, f16, bf16 + +### CPU平台测试结果 + +![](sin-cpu-1.png) + +…… + +![](sin-cpu-2.png) + +### Metax平台测试结果 + +![](sin-metax-1.png) + +…… + +![](sin-metax-2.png) + +## 3. Cos算子测试 + +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持数据类型涵盖了f32, f16, bf16 +### CPU平台测试结果 +![alt text](image-6.png) +…… +![alt text](image-7.png) + +### Metax平台测试结果 +![alt text](image-4.png) +…… +![alt text](image-5.png) + +## 4. LeakyRelu算子测试 +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持数据类型涵盖了f32, f16, bf16 +### CPU平台测试结果 +![alt text](image-8.png) +…… +![alt text](image-9.png) +### Metax平台测试结果 +![alt text](image-10.png) +…… +![alt text](image-11.png) + +## 5. Tanh算子测试 +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持数据类型涵盖了f32, f16, bf16 + +### CPU平台测试结果 + +![](tanh-cpu-1.png) + +…… + +![](tanh-cpu-2.png) + +### Metax平台测试结果 + +![](tanh-metax-1.png) + +…… + +![](tanh-metax-2.png) + +## 6. Sigmoid Backward算子测试 +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持数据类型涵盖了f32, f16, bf16 +### CPU平台测试结果 +![alt text](image-12.png) +…… +![alt text](image-13.png) + +### Metax平台测试结果 +![alt text](image-14.png) +…… +![alt text](image-15.png) + +## 7. 
HardSwish算子测试 +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持数据类型涵盖了f32, f16, bf16 +### CPU平台测试结果 +![alt text](image-16.png) +…… +![alt text](image-17.png) +### Metax平台测试结果 +![alt text](image-18.png) +…… +![alt text](image-19.png) + +## 8. Cast算子测试 +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持整数类型 (int32, int64, uint32, uint64) 之间互转 +- 支持浮点类型 (f32, f16, f64) 之间互转 +- 支持整数类型 (int32, int64, uint32, uint64) 到浮点类型 (f32, f16, f64) 的互转 +### CPU平台测试结果 +![alt text](image-20.png) +…… +![alt text](image-21.png) +### Metax平台测试结果 +![alt text](image-22.png) +…… +![alt text](image-23.png) + +## 9. Where算子测试 + +- 测试能够覆盖多种输入输出的形状以及排布且已通过CPU、Metax平台上的PyTorch测试 +- 支持数据类型涵盖了f32, f16,f64, bf16,bool,I8,I16,I32,I64 + +### CPU平台测试结果 + +![](where-cpu-1.png) + +…… + +![](where-cpu-2.png) + +### Metax平台测试结果 + +![](where-metax-1.png) + +…… + +![](where-metax-2.png) +