diff --git a/include/infiniop.h b/include/infiniop.h index 0acad83f9..4a0773b32 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -4,21 +4,41 @@ #include "infiniop/handle.h" #include "infiniop/ops/add.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/averagepool.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" +#include "infiniop/ops/cross_entropy_loss.h" #include "infiniop/ops/dequantize.h" +#include "infiniop/ops/div.h" +#include "infiniop/ops/exp.h" #include "infiniop/ops/gemm.h" +#include "infiniop/ops/hardswish.h" +#include "infiniop/ops/interpolate_nearest.h" +#include "infiniop/ops/logical_and.h" +#include "infiniop/ops/logical_or.h" +#include "infiniop/ops/maxpool.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/random_sample.h" #include "infiniop/ops/rearrange.h" +#include "infiniop/ops/reduce_max.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" +#include "infiniop/ops/sin.h" #include "infiniop/ops/softplus.h" #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" +#include "infiniop/ops/where.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/layer_norm.h" +#include "infiniop/ops/index_copy_inplace.h" +#include "infiniop/ops/gather.h" +#include "infiniop/ops/scatter.h" +#include "infiniop/ops/batch_norm.h" +#include "infiniop/ops/equal.h" +#include "infiniop/ops/reduce_mean.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/averagepool.h b/include/infiniop/ops/averagepool.h new file mode 100644 index 000000000..87e857175 --- /dev/null +++ b/include/infiniop/ops/averagepool.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_AVERAGEPOOL_H__ +#define __INFINIOP_AVERAGEPOOL_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t; + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode); + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc); + +#endif // __INFINIOP_AVERAGEPOOL_H__ diff --git a/include/infiniop/ops/batch_norm.h b/include/infiniop/ops/batch_norm.h new file mode 100644 index 000000000..5487a1f69 --- /dev/null +++ b/include/infiniop/ops/batch_norm.h @@ -0,0 +1,37 @@ +#ifndef __INFINIOP_BATCH_NORM_API_H__ +#define __INFINIOP_BATCH_NORM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopBatchNormDescriptor_t; + +__C __export infiniStatus_t infiniopCreateBatchNormDescriptor( + infiniopHandle_t handle, + infiniopBatchNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +); + +__C __export infiniStatus_t
infiniopGetBatchNormWorkspaceSize(infiniopBatchNormDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopBatchNorm(infiniopBatchNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyBatchNormDescriptor(infiniopBatchNormDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cross_entropy_loss.h b/include/infiniop/ops/cross_entropy_loss.h new file mode 100644 index 000000000..8b59843c9 --- /dev/null +++ b/include/infiniop/ops/cross_entropy_loss.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ +#define __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCrossEntropyLossDescriptor_t; + +__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle, + infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc); + +__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream); + +__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc); + +#endif // __INFINIOP_CROSS_ENTROPY_LOSS_API_H__ diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h new file mode 100644 index 000000000..e539b440c --- /dev/null +++ b/include/infiniop/ops/div.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h new file mode 100644 index 000000000..36a81984a --- /dev/null +++ b/include/infiniop/ops/equal.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_EQUAL_API_H__ +#define __INFINIOP_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; + +__C __export infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +); + +__C __export infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +); + +__C __export infiniStatus_t 
infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h new file mode 100644 index 000000000..624bc5363 --- /dev/null +++ b/include/infiniop/ops/exp.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_EXP_API_H__ +#define __INFINIOP_EXP_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopExpDescriptor_t; + +__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gather.h b/include/infiniop/ops/gather.h new file mode 100644 index 000000000..9ffe310c9 --- /dev/null +++ b/include/infiniop/ops/gather.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_GATHER_API_H__ +#define __INFINIOP_GATHER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..8d655fe82 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/index_copy_inplace.h b/include/infiniop/ops/index_copy_inplace.h new file mode 100644 index 000000000..e2266299a --- /dev/null +++ b/include/infiniop/ops/index_copy_inplace.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_API_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; + +__C __export infiniStatus_t 
infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopIndexCopyInplace(infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/interpolate_nearest.h b/include/infiniop/ops/interpolate_nearest.h new file mode 100644 index 000000000..7f970dc38 --- /dev/null +++ b/include/infiniop/ops/interpolate_nearest.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_INTERPOLATE_NEAREST_H__ +#define __INFINIOP_INTERPOLATE_NEAREST_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopInterpolateNearestDescriptor_t; + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc); + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc); + +#endif // __INFINIOP_INTERPOLATE_NEAREST_H__ diff --git a/include/infiniop/ops/layer_norm.h b/include/infiniop/ops/layer_norm.h new file mode 100644 index 000000000..e1d745723 --- /dev/null +++ b/include/infiniop/ops/layer_norm.h @@ -0,0 +1,36 @@ +#ifndef __INFINIOP_LAYER_NORM_API_H__ +#define __INFINIOP_LAYER_NORM_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLayerNormDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLayerNormDescriptor( + infiniopHandle_t handle, + infiniopLayerNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +); + +__C __export infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLayerNorm(infiniopLayerNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const void * input, + const void * weight, + const void * bias, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/logical_and.h b/include/infiniop/ops/logical_and.h new file mode 100644 index 000000000..5c237f79c --- /dev/null +++ b/include/infiniop/ops/logical_and.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_LOGICAL_AND_API_H__ +#define __INFINIOP_LOGICAL_AND_API_H__ + +#include "../operator_descriptor.h" + 
+typedef struct InfiniopDescriptor *infiniopLogicalAndDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalAndDescriptor(infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalAnd(infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/logical_or.h b/include/infiniop/ops/logical_or.h new file mode 100644 index 000000000..1c0066139 --- /dev/null +++ b/include/infiniop/ops/logical_or.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_LOGICAL_OR_API_H__ +#define __INFINIOP_LOGICAL_OR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogicalOrDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalOrDescriptor(infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalOr(infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/maxpool.h b/include/infiniop/ops/maxpool.h new file mode 100644 index 000000000..e47a43aed --- /dev/null +++ b/include/infiniop/ops/maxpool.h @@ -0,0 +1,29 @@ +#ifndef __INFINIOP_MAX_POOL_H__ +#define __INFINIOP_MAX_POOL_H__ + +#include "../operator_descriptor.h" + +__C typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t; + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode); + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc, + size_t *size); + +__C infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc); + +#endif // __INFINIOP_MAX_POOL_H__ diff --git a/include/infiniop/ops/reduce_max.h b/include/infiniop/ops/reduce_max.h new file mode 100644 index 000000000..42a3dd62d --- /dev/null +++ b/include/infiniop/ops/reduce_max.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MAX_API_H__ +#define __INFINIOP_REDUCE_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size); + +__C 
__export infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/reduce_mean.h b/include/infiniop/ops/reduce_mean.h new file mode 100644 index 000000000..5efd8b227 --- /dev/null +++ b/include/infiniop/ops/reduce_mean.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MEAN_API_H__ +#define __INFINIOP_REDUCE_MEAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/scatter.h b/include/infiniop/ops/scatter.h new file mode 100644 index 000000000..22e0eff83 --- /dev/null +++ b/include/infiniop/ops/scatter.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_SCATTER_API_H__ +#define __INFINIOP_SCATTER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopScatterDescriptor_t; + +__C __export infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..640deccc0 --- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..742dba860 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define 
__INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..713db102f --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + infiniopTensorDescriptor_t condition); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); + +__C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index 5348c8c69..de4cb268a 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -25,6 +25,11 @@ def run_tests(args): "sub.py", "swiglu.py", "softplus.py", + "where.py", + "hardswish.py", + "tanh.py", + "sin.py", + "exp.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..ef6ba8c39 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -15,7 +15,25 @@ DECLARE_INFINIOP_TEST(swiglu) DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) +DECLARE_INFINIOP_TEST(div) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(where) +DECLARE_INFINIOP_TEST(hardswish) +DECLARE_INFINIOP_TEST(interpolate_nearest) +DECLARE_INFINIOP_TEST(maxpool) +DECLARE_INFINIOP_TEST(tanh) +DECLARE_INFINIOP_TEST(sin) +DECLARE_INFINIOP_TEST(exp) +DECLARE_INFINIOP_TEST(averagepool) +DECLARE_INFINIOP_TEST(cross_entropy_loss) +DECLARE_INFINIOP_TEST(logical_and) +DECLARE_INFINIOP_TEST(logical_or) +DECLARE_INFINIOP_TEST(layer_norm) +DECLARE_INFINIOP_TEST(index_copy_inplace) +DECLARE_INFINIOP_TEST(gather) +DECLARE_INFINIOP_TEST(scatter) +DECLARE_INFINIOP_TEST(batch_norm) +DECLARE_INFINIOP_TEST(equal) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +48,32 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - 
REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(averagepool) \ + REGISTER_INFINIOP_TEST(cross_entropy_loss) \ + REGISTER_INFINIOP_TEST(maxpool) \ + REGISTER_INFINIOP_TEST(interpolate_nearest) \ + REGISTER_INFINIOP_TEST(logical_and) \ + REGISTER_INFINIOP_TEST(logical_or) \ + REGISTER_INFINIOP_TEST(layer_norm) \ + REGISTER_INFINIOP_TEST(index_copy_inplace) \ + REGISTER_INFINIOP_TEST(gather) \ + REGISTER_INFINIOP_TEST(scatter) \ + REGISTER_INFINIOP_TEST(batch_norm) \ + REGISTER_INFINIOP_TEST(equal) \ + REGISTER_INFINIOP_TEST(div) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/averagepool.cpp b/src/infiniop-test/src/ops/averagepool.cpp new file mode 100644 index 000000000..4f6a80201 --- /dev/null +++ b/src/infiniop-test/src/ops/averagepool.cpp @@ -0,0 +1,265 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::averagepool { + +struct Test::Attributes { + // Input and expected output + std::shared_ptr input; + std::shared_ptr expected_output; + + // Average pooling parameters + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->expected_output = tensors["output"]; + + // (N, C, spatial...) 
→ number of pooled dims = rank - 2 + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error( + "Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // ---- Parse and broadcast kernel_size ---- + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; ++i) { + test->_attributes->kernel_size.push_back( + static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign( + pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // ---- Parse and broadcast stride ---- + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + + if (stride_count == pool_ndim) { + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; ++i) { + test->_attributes->stride.push_back( + static_cast(stride_ptr[i])); + } + } else { + test->_attributes->stride.assign( + pool_ndim, static_cast(stride_ptr[0])); + } + + // ---- Parse and broadcast padding ---- + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; ++i) { + test->_attributes->padding.push_back( + static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign( + pool_ndim, static_cast(padding_ptr[0])); + } + + // ---- Parse ceil_mode ---- + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopAvgPoolDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; + + auto input_dtype = input->ggml_type(); + auto output_shape = expected_output->shape(); + + size_t output_size_bytes = 1; + for (auto d : output_shape) { + output_size_bytes *= d; + } + output_size_bytes *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size_bytes, device, device_id); + + std::vector output_strides(output_shape.size()); + if (!output_shape.empty()) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; --i) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // Parameter pointers (the underlying API takes void*) + void *kernel_size_ptr = 
_attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // ---- Create the operator descriptor ---- + CHECK_OR(infiniopCreateAvgPoolDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc(), + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to create avgpool descriptor.")); + + // ---- Query the workspace size ---- + size_t workspace_size = 0; + CHECK_OR(infiniopGetAvgPoolWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to get workspace size.")); + + // ---- Allocate workspace (if needed) ---- + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to allocate workspace.")); + } + + // ---- Run average pooling ---- + CHECK_OR(infiniopAvgPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, + "Failed during avgpool execution.")); + + // ---- Check accuracy ---- + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // ---- Benchmark ---- + double elapsed_time = benchmark( + [=]() { + infiniopAvgPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // ---- Clean up resources ---- + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]\n- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]\n- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]\n- ceil_mode: " + << (_attributes->ceil_mode ? 
"true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::averagepool diff --git a/src/infiniop-test/src/ops/batch_norm.cpp b/src/infiniop-test/src/ops/batch_norm.cpp new file mode 100644 index 000000000..a44fee0f0 --- /dev/null +++ b/src/infiniop-test/src/ops/batch_norm.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::batch_norm { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr running_mean; + std::shared_ptr running_var; + std::shared_ptr input; + std::shared_ptr weight; + std::shared_ptr bias; + float momentum; + float eps; + std::shared_ptr ans_output; + std::shared_ptr ans_running_mean; + std::shared_ptr ans_running_var; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("running_mean") == tensors.end() + || tensors.find("running_var") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("weight") == tensors.end() + || tensors.find("bias") == tensors.end() + || tensors.find("ans_output") == tensors.end() + || tensors.find("ans_running_mean") == tensors.end() + || tensors.find("ans_running_var") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->running_mean = tensors["running_mean"]; + test->_attributes->running_var = tensors["running_var"]; + test->_attributes->input = tensors["input"]; + test->_attributes->weight = tensors["weight"]; + test->_attributes->bias = tensors["bias"]; + test->_attributes->ans_output = tensors["ans_output"]; + test->_attributes->ans_running_mean = tensors["ans_running_mean"]; + test->_attributes->ans_running_var = tensors["ans_running_var"]; + test->_attributes->momentum = *reinterpret_cast(attributes["momentum"].data()); + test->_attributes->eps = *reinterpret_cast(attributes["eps"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopBatchNormDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto running_mean = _attributes->running_mean->to(device, device_id); + auto running_var = _attributes->running_var->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto weight = _attributes->weight->to(device, device_id); + auto bias = _attributes->bias->to(device, device_id); + auto momentum = _attributes->momentum; + auto eps = _attributes->eps; + CHECK_OR(infiniopCreateBatchNormDescriptor(handle, &op_desc, + output->desc(), + running_mean->desc(), + running_var->desc(), + input->desc(), + weight->desc(), + bias->desc(), + momentum, + eps + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetBatchNormWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + 
CHECK_OR(infiniopBatchNorm(op_desc, workspace, workspace_size, + output->data(), + running_mean->data(), + running_var->data(), + input->data(), + weight->data(), + bias->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans_output, _rtol, _atol); + allClose(running_mean, _attributes->ans_running_mean, _rtol, _atol); + allClose(running_var, _attributes->ans_running_var, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopBatchNorm( + op_desc, workspace, workspace_size, + output->data(), + running_mean->data(), + running_var->data(), + input->data(), + weight->data(), + bias->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"momentum", "eps"}; +} + +std::vector Test::tensor_names() { + return {"output", "running_mean", "running_var", "input", "weight", "bias", "ans_output", "ans_running_mean", "ans_running_var"}; +} + +std::vector Test::output_names() { + return {"output", "running_mean", "running_var"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- running_mean: " << _attributes->running_mean->info() << std::endl; + oss << "- running_var: " << _attributes->running_var->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- weight: " << _attributes->weight->info() << std::endl; + oss << "- bias: " << _attributes->bias->info() << std::endl; + oss << "- momentum: " << _attributes->momentum << std::endl; + oss << "- eps: " << _attributes->eps << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::batch_norm diff --git a/src/infiniop-test/src/ops/cross_entropy_loss.cpp b/src/infiniop-test/src/ops/cross_entropy_loss.cpp new file mode 100644 index 000000000..7fac231e0 --- /dev/null +++ b/src/infiniop-test/src/ops/cross_entropy_loss.cpp @@ -0,0 +1,156 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cross_entropy_loss { + +struct Test::Attributes { + // Input tensors + std::shared_ptr logits; + std::shared_ptr target; + std::shared_ptr loss; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + // Check that all required tensors are present + if (!check_names(tensors, Test::tensor_names()) || !check_names(attributes, Test::attribute_names())) { + throw std::runtime_error("Invalid Test: Missing required tensors."); + } + + test->_attributes->logits = tensors["logits"]; + test->_attributes->target = tensors["target"]; + test->_attributes->loss = tensors["loss"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopCrossEntropyLossDescriptor_t op_desc; + + // Move input tensors to the target device + auto logits = _attributes->logits->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto loss = _attributes->loss; + + // Create the actual output tensor from the expected output's shape + 
auto output_shape = loss->shape(); + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(logits->ggml_type()); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(static_cast(output_shape.size())); + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, logits->ggml_type()); + + // 1. Create the operator descriptor + CHECK_OR(infiniopCreateCrossEntropyLossDescriptor( + handle, &op_desc, + actual_output->desc(), + logits->desc(), + target->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cross entropy loss descriptor.")); + + // 2. Query and allocate the workspace + size_t workspace_size; + CHECK_OR(infiniopGetCrossEntropyLossWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 3. Run the computation + CHECK_OR(infiniopCrossEntropyLoss( + op_desc, workspace, workspace_size, + actual_output->data(), + logits->data(), + target->data(), + nullptr), // stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during cross entropy loss execution.")); + + // 4. Verify the result + try { + allClose(actual_output, loss, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyCrossEntropyLossDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 5. Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopCrossEntropyLoss( + op_desc, workspace, workspace_size, + actual_output->data(), + logits->data(), + target->data(), + nullptr); // stream + }, + warm_ups, iterations); + + // 6. 
Clean up resources + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyCrossEntropyLossDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +// List of attribute names required by this operator +std::vector Test::attribute_names() { + return {}; // CrossEntropyLoss has no extra attributes +} + +// List of tensor names required by this operator +std::vector Test::tensor_names() { + return {"logits", "target", "loss"}; +} + +std::vector Test::output_names() { + return {}; +} + +// Helper for printing test information +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- logits: " << _attributes->logits->info() << std::endl; + oss << "- target: " << _attributes->target->info() << std::endl; + oss << "- loss: " << _attributes->loss->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::cross_entropy_loss diff --git a/src/infiniop-test/src/ops/div.cpp b/src/infiniop-test/src/ops/div.cpp new file mode 100644 index 000000000..c1f49bda6 --- /dev/null +++ b/src/infiniop-test/src/ops/div.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::div { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopDivDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateDivDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetDivWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopDiv(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopDiv( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return 
{"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::div diff --git a/src/infiniop-test/src/ops/equal.cpp b/src/infiniop-test/src/ops/equal.cpp new file mode 100644 index 000000000..25bad7014 --- /dev/null +++ b/src/infiniop-test/src/ops/equal.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + 
+Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::equal diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp new file mode 100644 index 000000000..070f8ef6b --- /dev/null +++ b/src/infiniop-test/src/ops/exp.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::exp { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopExpDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateExpDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetExpWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopExp(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopExp( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::exp diff --git a/src/infiniop-test/src/ops/gather.cpp b/src/infiniop-test/src/ops/gather.cpp new file mode 100644 index 000000000..e1b998fe9 --- /dev/null +++ b/src/infiniop-test/src/ops/gather.cpp 
@@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gather { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGatherDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateGatherDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetGatherWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopGather(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGather( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gather diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp new file mode 100644 index 000000000..0ccf4f52a --- /dev/null +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" 
+#include +#include +#include + +namespace infiniop_test::hardswish { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopHardswishDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopHardswish( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::hardswish diff --git a/src/infiniop-test/src/ops/index_copy_inplace.cpp b/src/infiniop-test/src/ops/index_copy_inplace.cpp new file mode 100644 index 000000000..9f5fb5be9 --- /dev/null +++ b/src/infiniop-test/src/ops/index_copy_inplace.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::index_copy_inplace { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + 
size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopIndexCopyInplaceDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateIndexCopyInplaceDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetIndexCopyInplaceWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopIndexCopyInplace(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopIndexCopyInplace( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::index_copy_inplace diff --git a/src/infiniop-test/src/ops/interpolate_nearest.cpp b/src/infiniop-test/src/ops/interpolate_nearest.cpp new file mode 100644 index 000000000..071527249 --- /dev/null +++ b/src/infiniop-test/src/ops/interpolate_nearest.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::interpolate_nearest { + +struct Test::Attributes { + std::shared_ptr input; + 
std::shared_ptr expected_output; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + std::cout << "DEBUG: Name check failed" << std::endl; + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; // F32 输入数据 + test->_attributes->expected_output = tensors["output"]; // F64 期望结果 + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopInterpolateNearestDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; // F64 期望结果 + + // 动态创建实际的输出张量,使用期望结果的形状,但使用输入的数据类型 + auto output_shape = expected_output->shape(); + auto input_dtype = input->ggml_type(); + + // 创建输出张量的内存 + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(output_shape.size()); + + // 计算连续的步长 + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // Create operator descriptor + CHECK_OR(infiniopCreateInterpolateNearestDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + // Get workspace size + size_t workspace_size; + CHECK_OR(infiniopGetInterpolateNearestWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // Allocate workspace if needed + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // Execute interpolate nearest + CHECK_OR(infiniopInterpolateNearest( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + // Verify result - 比较实际输出和期望结果 + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyInterpolateNearestDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopInterpolateNearest( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // Cleanup + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyInterpolateNearestDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- 
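// Worked example of the contiguous-stride computation in Test::run above (the same
// pattern reappears in the maxpool test): for output_shape = {2, 3, 4, 5} the loop
// fills output_strides back to front as {60, 20, 5, 1}, i.e. stride[i] =
// stride[i + 1] * shape[i + 1] with the innermost stride fixed at 1.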
input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::interpolate_nearest diff --git a/src/infiniop-test/src/ops/layer_norm.cpp b/src/infiniop-test/src/ops/layer_norm.cpp new file mode 100644 index 000000000..b2dfa3e1b --- /dev/null +++ b/src/infiniop-test/src/ops/layer_norm.cpp @@ -0,0 +1,147 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::layer_norm { +struct Test::Attributes { + bool bias_exist; + std::shared_ptr output; + std::shared_ptr input_standardization; + std::shared_ptr input_std_deviation; + std::shared_ptr input; + std::shared_ptr weight; + std::shared_ptr bias; + float eps; + std::shared_ptr ans_output; + std::shared_ptr ans_input_standardization; + std::shared_ptr ans_input_std_deviation; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input_standardization") == tensors.end() + || tensors.find("input_std_deviation") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("weight") == tensors.end() + || tensors.find("bias") == tensors.end() + || tensors.find("ans_output") == tensors.end() + || tensors.find("ans_input_standardization") == tensors.end() + || tensors.find("ans_input_std_deviation") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input_standardization = tensors["input_standardization"]; + test->_attributes->input_std_deviation = tensors["input_std_deviation"]; + test->_attributes->input = tensors["input"]; + test->_attributes->weight = tensors["weight"]; + test->_attributes->bias = tensors["bias"]; + test->_attributes->ans_output = tensors["ans_output"]; + test->_attributes->ans_input_standardization = tensors["ans_input_standardization"]; + test->_attributes->ans_input_std_deviation = tensors["ans_input_std_deviation"]; + test->_attributes->eps = *reinterpret_cast(attributes["eps"].data()); + test->_attributes->bias_exist = *reinterpret_cast(attributes["bias_exist"].data()); + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLayerNormDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input_standardization = _attributes->input_standardization->to(device, device_id); + auto input_std_deviation = _attributes->input_std_deviation->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto weight = _attributes->weight->to(device, device_id); + auto bias = _attributes->bias->to(device, device_id); + auto eps = _attributes->eps; + CHECK_OR(infiniopCreateLayerNormDescriptor(handle, &op_desc, + output->desc(), + input_standardization->desc(), + input_std_deviation->desc(), + input->desc(), + weight->desc(), + (_attributes->bias_exist) ? 
bias->desc() : nullptr, + eps + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLayerNormWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLayerNorm(op_desc, workspace, workspace_size, + output->data(), + input_standardization->data(), + input_std_deviation->data(), + input->data(), + weight->data(), + (_attributes->bias_exist) ? bias->data() : nullptr, + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans_output, _rtol, _atol); + allClose(input_standardization, _attributes->ans_input_standardization, _rtol, _atol); + allClose(input_std_deviation, _attributes->ans_input_std_deviation, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLayerNorm( + op_desc, workspace, workspace_size, + output->data(), + input_standardization->data(), + input_std_deviation->data(), + input->data(), + weight->data(), + (_attributes->bias_exist) ? bias->data() : nullptr, + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"bias_exist", "eps"}; +} + +std::vector Test::tensor_names() { + return {"output", "input_standardization", "input_std_deviation", "input", "weight", "bias", "ans_output", "ans_input_standardization", "ans_input_std_deviation"}; +} + +std::vector Test::output_names() { + return {"output", "input_standardization", "input_std_deviation"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input_standardization: " << _attributes->input_standardization->info() << std::endl; + oss << "- input_std_deviation: " << _attributes->input_std_deviation->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- weight: " << _attributes->weight->info() << std::endl; + oss << "- bias: " << (_attributes->bias_exist ? 
_attributes->bias->info() : "null") << std::endl; + oss << "- eps: " << _attributes->eps << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::layer_norm diff --git a/src/infiniop-test/src/ops/logical_and.cpp b/src/infiniop-test/src/ops/logical_and.cpp new file mode 100644 index 000000000..152a3027d --- /dev/null +++ b/src/infiniop-test/src/ops/logical_and.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_and { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalAndDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalAndDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalAndWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalAnd(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalAnd( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_and diff --git 
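Note: logical_and above (and logical_or below) exercise the standard create, query-workspace, execute flow. A condensed caller-side sketch of that sequence, including the teardown the tests omit; the infiniopDestroyLogicalAndDescriptor name and the status return of infinirtMalloc are assumptions by analogy with the other operators and calls in this patch, and error handling is abbreviated:

infiniStatus_t runLogicalAnd(infiniopHandle_t handle,
                             infiniopTensorDescriptor_t c_desc, void *c,
                             infiniopTensorDescriptor_t a_desc, const void *a,
                             infiniopTensorDescriptor_t b_desc, const void *b,
                             void *stream) {
    infiniopLogicalAndDescriptor_t desc;
    infiniStatus_t status = infiniopCreateLogicalAndDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    infiniopGetLogicalAndWorkspaceSize(desc, &workspace_size);

    void *workspace = nullptr;
    if (workspace_size > 0) {
        infinirtMalloc(&workspace, workspace_size); // assumed to report failure via a status return
    }

    status = infiniopLogicalAnd(desc, workspace, workspace_size, c, a, b, stream);

    if (workspace) {
        infinirtFree(workspace);
    }
    infiniopDestroyLogicalAndDescriptor(desc); // assumed name, by analogy with the other ops
    return status;
}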
a/src/infiniop-test/src/ops/logical_or.cpp b/src/infiniop-test/src/ops/logical_or.cpp new file mode 100644 index 000000000..8f7a261d5 --- /dev/null +++ b/src/infiniop-test/src/ops/logical_or.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_or { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalOrDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalOrDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalOrWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalOr(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalOr( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_or diff --git a/src/infiniop-test/src/ops/maxpool.cpp b/src/infiniop-test/src/ops/maxpool.cpp new file mode 100644 index 000000000..698c5ad89 --- /dev/null +++ b/src/infiniop-test/src/ops/maxpool.cpp @@ -0,0 +1,263 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::maxpool { + +struct Test::Attributes { + // 输入张量 + std::shared_ptr input; + 
std::shared_ptr expected_output; + + // 最大池化参数 + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + auto input_tensor = tensors["input"]; + test->_attributes->input = tensors["input"]; + test->_attributes->expected_output = tensors["output"]; + + // 获取池化维度(输入张量维度 - 2,去掉batch和channel维度) + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error("Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // 解析并广播 kernel_size - 修复类型转换 + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; i++) { + test->_attributes->kernel_size.push_back(static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign(pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // 解析并广播 stride + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + + if (stride_count == pool_ndim) { + // 直接使用提供的值 + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; i++) { + test->_attributes->stride.push_back(static_cast(stride_ptr[i])); + } + } else { + // 广播单个值到所有维度 + test->_attributes->stride.assign(pool_ndim, static_cast(stride_ptr[0])); + } + + // 解析并广播 padding + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; i++) { + test->_attributes->padding.push_back(static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign(pool_ndim, static_cast(padding_ptr[0])); + } + + // 解析 ceil_mode + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopMaxPoolDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; + + auto input_dtype = input->ggml_type(); + + auto output_shape = expected_output->shape(); + + 
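// Note: Test::build above broadcasts a scalar kernel_size / stride / padding to every
// pooled dimension. A standalone sketch of that rule (hypothetical helper, not part of
// this patch): for a 4-D NCHW input pool_ndim is 2, so a kernel_size attribute of {3}
// becomes {3, 3}, while {3, 2} is used as given.

#include <cstddef>
#include <vector>

std::vector<size_t> broadcastPoolParam(const std::vector<int> &values, size_t pool_ndim) {
    std::vector<size_t> out;
    if (values.size() == pool_ndim) {
        for (int v : values) {
            out.push_back(static_cast<size_t>(v)); // per-dimension values, used as-is
        }
    } else {
        out.assign(pool_ndim, static_cast<size_t>(values[0])); // broadcast the single value
    }
    return out;
}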
size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(output_shape.size()); + + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // 准备参数指针 + void *kernel_size_ptr = _attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // 创建算子描述符 + CHECK_OR(infiniopCreateMaxPoolDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc(), + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create maxpool descriptor.")); + + // 获取工作空间大小 + size_t workspace_size; + CHECK_OR(infiniopGetMaxPoolWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // 分配工作空间 + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 执行最大池化 + CHECK_OR(infiniopMaxPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during maxpool execution.")); + + // 验证结果 + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopMaxPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]" << std::endl; + + oss << "- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]" << std::endl; + + oss << "- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]" << std::endl; + + oss << "- ceil_mode: " << (_attributes->ceil_mode ? 
"true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::maxpool diff --git a/src/infiniop-test/src/ops/scatter.cpp b/src/infiniop-test/src/ops/scatter.cpp new file mode 100644 index 000000000..691e338da --- /dev/null +++ b/src/infiniop-test/src/ops/scatter.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::scatter { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopScatterDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateScatterDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetScatterWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopScatter(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopScatter( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << 
std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::scatter diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp new file mode 100644 index 000000000..e1406e588 --- /dev/null +++ b/src/infiniop-test/src/ops/sin.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sin { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSinDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSinDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSinWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSin(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSin( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::sin diff --git a/src/infiniop-test/src/ops/tanh.cpp 
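Note: sin above, and tanh and hardswish elsewhere in this patch, all override the default tolerances based on the input element type. A possible consolidation of that repeated pattern; this helper is a sketch, not part of the patch, and assumes the ggml_type enum from ggml.h that the tensor wrapper already exposes via ggml_type():

#include <utility>
// Requires ggml.h for ggml_type and the GGML_TYPE_* constants.
// Returns {rtol, atol}. f32_tol differs per operator (1e-7 for sin, 1e-6 for tanh and
// hardswish); any other type keeps the values passed to Test::build.
std::pair<double, double> toleranceFor(ggml_type type,
                                       std::pair<double, double> defaults,
                                       std::pair<double, double> f32_tol) {
    switch (type) {
    case GGML_TYPE_BF16:
        return {1e-2, 1e-2};
    case GGML_TYPE_F16:
        return {1e-3, 1e-3};
    case GGML_TYPE_F32:
        return f32_tol;
    default:
        return defaults;
    }
}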
b/src/infiniop-test/src/ops/tanh.cpp new file mode 100644 index 000000000..6f966de09 --- /dev/null +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tanh { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTanhDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTanh( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::tanh diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp new file mode 100644 index 000000000..fea9cba92 --- /dev/null +++ b/src/infiniop-test/src/ops/where.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::where 
{ +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr condition; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("condition") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->condition = tensors["condition"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->a->ggml_type(); + if (elemType == GGML_TYPE_I8) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopWhereDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto condition = _attributes->condition->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateWhereDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc(), + condition->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetWhereWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopWhere(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopWhere( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "condition", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- condition: " << 
_attributes->condition->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::where diff --git a/src/infiniop/ops/averagepool/averagepool.h b/src/infiniop/ops/averagepool/averagepool.h new file mode 100644 index 000000000..7762826ab --- /dev/null +++ b/src/infiniop/ops/averagepool/averagepool.h @@ -0,0 +1,52 @@ +#ifndef __AVERAGEPOOL_H__ +#define __AVERAGEPOOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::averagepool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + AvgPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + AvgPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __AVERAGEPOOL_H__ diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc new file mode 100644 index 000000000..2e8fa6851 --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc @@ -0,0 +1,362 @@ +#include "averagepool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::averagepool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + AvgPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const AvgPoolInfo &avgpool_info) + : handle(handle_ptr), info(avgpool_info) { + workspace_size = 0; + } + + template + void _avgpool_1d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + const size_t input_nc_stride = input_width; + const size_t output_nc_stride = output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int window_start = static_cast(ow * stride_width) - static_cast(pad_width); + const int window_end = window_start + static_cast(kernel_width); + + for (int iw = window_start; iw < window_end; ++iw) { 
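// Worked example of the averaging rule implemented here (the CUDA/metax kernels later
// in this patch follow the same rule): input_width = 4, kernel = 3, stride = 2,
// pad = 1, ceil_mode = true gives output_width = 3.
//   ow = 0: window covers positions {-1, 0, 1}; -1 lies in the explicit padding range
//           [-pad, input_width + pad), so sum = x0 + x1 and valid_count = 3.
//   ow = 1: window covers {1, 2, 3}, all in range, so valid_count = 3.
//   ow = 2: window covers {3, 4, 5}; 4 is explicit padding, 5 falls beyond the padded
//           extent (implicit ceil_mode padding) and is excluded, so valid_count = 2
//           and output[2] = x3 / 2.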
+ if (iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + iw]); + valid_count++; + } else if (iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + ow] = utils::cast(result); + } + } + } + } + + template + void _avgpool_2d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + const size_t input_nc_stride = input_height * input_width; + const size_t output_nc_stride = output_height * output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + ih * input_width + iw]); + valid_count++; + } else if (ih >= -static_cast(pad_height) && + ih < static_cast(input_height + pad_height) && + iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + oh * output_width + ow] = utils::cast(result); + } + } + } + } + } + + template + void _avgpool_3d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + const size_t input_nc_stride = input_depth * input_height * input_width; + const size_t output_nc_stride = output_depth * output_height * output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * 
channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kd = 0; kd < static_cast(kernel_depth); ++kd) { + const int id = start_d + kd; + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + const int ih = start_h + kh; + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int iw = start_w + kw; + + if (id >= 0 && id < static_cast(input_depth) && + ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + const size_t idx = id * (input_height * input_width) + + ih * input_width + iw; + sum += utils::cast(input[input_offset + idx]); + valid_count++; + } else if (id >= -static_cast(pad_depth) && + id < static_cast(input_depth + pad_depth) && + ih >= -static_cast(pad_height) && + ih < static_cast(input_height + pad_height) && + iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + + const size_t out_idx = od * (output_height * output_width) + + oh * output_width + ow; + output[output_offset + out_idx] = utils::cast(result); + } + } + } + } + } + } + + template + void _avgpool_cpu(Ydata *output, const T *input) const { + switch (info.ndim) { + case 1: + _avgpool_1d(output, input); + break; + case 2: + _avgpool_2d(output, input); + break; + case 3: + _avgpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + AvgPoolInfo &info) { + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + size_t output_size = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) { + output_size *= info.output_dims[i]; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + _avgpool_cpu(typed_output, typed_input); + break; + } + case INFINI_DTYPE_F16: { + float *typed_output_f32 = static_cast(workspace); + const fp16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + fp16_t *typed_output = static_cast(output); + #pragma omp parallel for + for(size_t i = 0; i < output_size; ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + float *typed_output_f32 = static_cast(workspace); + const bf16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + bf16_t *typed_output = static_cast(output); + #pragma omp parallel for + for(size_t i = 0; i < output_size; ++i) { + 
typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +inline size_t calculateOutputSize(const AvgPoolInfo &info) { + size_t size = info.batch * info.channels; + for(size_t i = 0; i < info.ndim; ++i) { + size *= info.output_dims[i]; + } + return size; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + size_t workspace_size = 0; + if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) { + workspace_size = calculateOutputSize(info) * sizeof(float); + } + + *desc_ptr = new Descriptor(dtype, std::move(info), workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::averagepool::cpu diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h new file mode 100644 index 000000000..8388f80ff --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CPU_H__ +#define __AVERAGEPOOL_CPU_H__ + +#include "../averagepool.h" + +DESCRIPTOR(cpu) + +#endif // __AVERAGEPOOL_CPU_H__ diff --git a/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh new file mode 100644 index 000000000..7c9d0f438 --- /dev/null +++ b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh @@ -0,0 +1,185 @@ +#ifndef __AVERAGEPOOL_KERNEL_H__ +#define __AVERAGEPOOL_KERNEL_H__ + +#include + +// 1D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool1d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_length, + int output_length, int kernel_size, int stride, int padding) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= output_length) { + return; + } + + // 计算输入和输出的偏移 + const T *input_ptr = input + batch_idx * channels * input_length + channel_idx * input_length; + T *output_ptr = output + batch_idx * channels * output_length + channel_idx * output_length; + + // 计算池化窗口的起始位置 + int window_start = output_idx * stride - padding; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int k = 0; k < kernel_size; ++k) { + int input_pos = window_start + k; + + if (input_pos >= 0 && input_pos < input_length) { + // 
有效的输入位置,转换为单精度进行累加 + sum += static_cast(input_ptr[input_pos]); + valid_count++; + } else if (input_pos >= -padding && input_pos < input_length + padding) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 2D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool2d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_height, + int input_width, int output_height, int output_width, int kernel_h, + int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为2D坐标 + int out_h = output_idx / output_width; + int out_w = output_idx % output_width; + + // 计算输入和输出的偏移 + const T *input_ptr = input + batch_idx * channels * input_height * input_width + channel_idx * input_height * input_width; + T *output_ptr = output + batch_idx * channels * output_height * output_width + channel_idx * output_height * output_width; + + // 计算池化窗口的起始位置 + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = input_h * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 3D平均池化kernel,兼容PyTorch的隐式填充逻辑 +template +__global__ void avgpool3d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_depth, + int input_height, int input_width, int output_depth, int output_height, + int output_width, int kernel_d, int kernel_h, int kernel_w, int stride_d, + int stride_h, int stride_w, int pad_d, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_depth * output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为3D坐标 + int out_d = output_idx / (output_height * output_width); + int remaining = output_idx % (output_height * output_width); + int out_h = remaining / output_width; + int out_w = remaining % output_width; + + // 计算输入和输出的偏移 + int input_spatial_size = input_depth * input_height * input_width; + int output_spatial_size = output_depth * output_height * output_width; + + const T *input_ptr = input + batch_idx * channels * input_spatial_size + channel_idx * 
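// Worked example of the linear-to-3D index decomposition just above: with
// output_depth = 2, output_height = 3, output_width = 4 and output_idx = 17,
//   out_d = 17 / (3 * 4) = 1, remaining = 17 % 12 = 5,
//   out_h = 5 / 4 = 1, out_w = 5 % 4 = 1,
// i.e. the thread handles output element (d, h, w) = (1, 1, 1).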
input_spatial_size; + T *output_ptr = output + batch_idx * channels * output_spatial_size + channel_idx * output_spatial_size; + + // 计算池化窗口的起始位置 + int window_start_d = out_d * stride_d - pad_d; + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kd = 0; kd < kernel_d; ++kd) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_d = window_start_d + kd; + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = (input_d * input_height + input_h) * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_d >= -pad_d && input_d < input_depth + pad_d && input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +#endif // __AVERAGEPOOL_KERNEL_H__ diff --git a/src/infiniop/ops/averagepool/info.h b/src/infiniop/ops/averagepool/info.h new file mode 100644 index 000000000..871e827a7 --- /dev/null +++ b/src/infiniop/ops/averagepool/info.h @@ -0,0 +1,136 @@ +#ifndef __AVERAGEPOOL_INFO_H__ +#define __AVERAGEPOOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include +#include + +namespace op::averagepool { + +inline utils::Result calculatePoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + size_t padded_input_size = input_size + 2 * padding; + + if (padded_input_size < kernel_size) { + return utils::Result(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t output_size; + if (ceil_mode) { + // 等效于整数的上取整 + output_size = (padded_input_size - kernel_size + stride - 1) / stride + 1; + } else { + // 等效于整数的下取整 + output_size = (padded_input_size - kernel_size) / stride + 1; + } + + return utils::Result(output_size); +} + +// 检查是否存在隐式填充 +inline bool hasImplicitPadding( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding, + bool ceil_mode) { + + if (!ceil_mode) { + return false; + } + return ((input_size + 2 * padding) - kernel_size) % stride != 0; +} + +class AvgPoolInfo { + AvgPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + bool has_implicit_padding = false; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + AvgPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != 
output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // 空间维度 + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // 初始化隐式填充标志 + info.has_implicit_padding = false; + + // 获取并校验空间维度 + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + + auto output_size_result = calculatePoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size_result); + + size_t expected_size = output_size_result.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + + // 检查当前维度是否存在隐式填充 + if (hasImplicitPadding(info.input_dims[i], info.kernel_sizes[i], + info.strides[i], info.pads[i], info.ceil_mode)) { + info.has_implicit_padding = true; + } + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::averagepool + +#endif // __AVERAGEPOOL_INFO_H__ diff --git a/src/infiniop/ops/averagepool/metax/averagepool_metax.h b/src/infiniop/ops/averagepool/metax/averagepool_metax.h new file mode 100644 index 000000000..eef332b5f --- /dev/null +++ b/src/infiniop/ops/averagepool/metax/averagepool_metax.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_METAX_H__ +#define __AVERAGEPOOL_METAX_H__ + +#include "../averagepool.h" + +DESCRIPTOR(metax) + +#endif // __AVERAGEPOOL_METAX_CUH__ diff --git a/src/infiniop/ops/averagepool/metax/averagepool_metax.maca b/src/infiniop/ops/averagepool/metax/averagepool_metax.maca new file mode 100644 index 000000000..ee3c4bd9c --- /dev/null +++ b/src/infiniop/ops/averagepool/metax/averagepool_metax.maca @@ -0,0 +1,332 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "averagepool_metax.h" +#include "../cuda/averagepool_kernel.cuh" +#include + +infiniStatus_t launch_avgpool_pytorch_kernel( + const op::averagepool::AvgPoolInfo& info, + const void* input, void* output, + infiniDtype_t data_type, hcStream_t stream) { + + int batch_size = static_cast(info.batch); + int channels = static_cast(info.channels); + + if (info.ndim == 1) { + // 1D平均池化 + int input_length = static_cast(info.input_dims[0]); + int output_length = static_cast(info.output_dims[0]); + int kernel_size = static_cast(info.kernel_sizes[0]); + int stride = static_cast(info.strides[0]); + int padding = static_cast(info.pads[0]); + + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (output_length + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool1d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_F16: + avgpool1d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_BF16: + avgpool1d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, 
input_length, output_length, + kernel_size, stride, padding); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 2) { + // 2D平均池化 + int input_height = static_cast(info.input_dims[0]); + int input_width = static_cast(info.input_dims[1]); + int output_height = static_cast(info.output_dims[0]); + int output_width = static_cast(info.output_dims[1]); + int kernel_h = static_cast(info.kernel_sizes[0]); + int kernel_w = static_cast(info.kernel_sizes[1]); + int stride_h = static_cast(info.strides[0]); + int stride_w = static_cast(info.strides[1]); + int pad_h = static_cast(info.pads[0]); + int pad_w = static_cast(info.pads[1]); + + int total_output_elements = output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool2d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool2d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool2d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 3) { + // 3D平均池化 + int input_depth = static_cast(info.input_dims[0]); + int input_height = static_cast(info.input_dims[1]); + int input_width = static_cast(info.input_dims[2]); + int output_depth = static_cast(info.output_dims[0]); + int output_height = static_cast(info.output_dims[1]); + int output_width = static_cast(info.output_dims[2]); + int kernel_d = static_cast(info.kernel_sizes[0]); + int kernel_h = static_cast(info.kernel_sizes[1]); + int kernel_w = static_cast(info.kernel_sizes[2]); + int stride_d = static_cast(info.strides[0]); + int stride_h = static_cast(info.strides[1]); + int stride_w = static_cast(info.strides[2]); + int pad_d = static_cast(info.pads[0]); + int pad_h = static_cast(info.pads[1]); + int pad_w = static_cast(info.pads[2]); + + int total_output_elements = output_depth * output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool3d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool3d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool3d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, 
channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else { + return INFINI_STATUS_BAD_PARAM; + } + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_desc, hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + hcdnnDataType_t hcdnn_data_type) { + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims = {static_cast(info.batch), static_cast(info.channels)}; + std::vector output_dims = {static_cast(info.batch), static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + output_dims.push_back(static_cast(info.output_dims[i])); + } + while (input_dims.size() < 5) input_dims.push_back(1); + while (output_dims.size() < 5) output_dims.push_back(1); + std::vector input_strides(input_dims.size(), 1); + std::vector output_strides(output_dims.size(), 1); + for (int i = input_dims.size() - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + output_strides[i] = output_strides[i + 1] * output_dims[i + 1]; + } + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor(input_desc, hcdnn_data_type, + input_dims.size(), input_dims.data(), input_strides.data())); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor(output_desc, hcdnn_data_type, + output_dims.size(), output_dims.data(), output_strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_size, strides, pads; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_size.push_back(static_cast(info.kernel_sizes[i])); + strides.push_back(static_cast(info.strides[i])); + pads.push_back(static_cast(info.pads[i])); + } + while (kernel_size.size() < 3) kernel_size.push_back(1); + while (strides.size() < 3) strides.push_back(1); + while (pads.size() < 3) pads.push_back(0); + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor(pooling_desc, HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + HCDNN_NOT_PROPAGATE_NAN, kernel_size.size(), + kernel_size.data(), pads.data(), strides.data())); + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + workspace_size = 0; + return 
INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { +#ifdef ENABLE_HCDNN_API + if (_info.has_implicit_padding) { + // 使用自定义kernel实现PyTorch兼容的逻辑 + return launch_avgpool_pytorch_kernel(_info, input, output, _dtype, (hcStream_t)stream); + } else { + const float alpha = 1.0f, beta = 0.0f; + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + CHECK_MCDNN(hcdnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; + } +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::metax diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool.cu b/src/infiniop/ops/averagepool/nvidia/averagepool.cu new file mode 100644 index 000000000..6f276aac8 --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool.cu @@ -0,0 +1,220 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "averagepool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, 
cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, kernel_vec.size(), kernel_vec.data(), + pad_vec.data(), stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Average pooling typically doesn't need a workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + 
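+        // Null out the moved-from object's cuDNN handles so its destructor's
+        // CLEANUP_CUDNN_DESCRIPTORS pass sees nullptr and does not destroy
+        // descriptors now owned by this instance.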
other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::nvidia diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh new file mode 100644 index 000000000..ef19aa1dc --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CUDA_CUH__ +#define __AVERAGEPOOL_CUDA_CUH__ + +#include "../averagepool.h" + +DESCRIPTOR(nvidia) + +#endif // __AVERAGEPOOL_CUDA_CUH__ diff --git a/src/infiniop/ops/averagepool/operator.cc b/src/infiniop/ops/averagepool/operator.cc new file mode 100644 index 000000000..c664504d5 --- /dev/null +++ b/src/infiniop/ops/averagepool/operator.cc @@ -0,0 +1,155 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/averagepool.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/averagepool_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/averagepool_metax.h" +#endif +#ifdef ENABLE_CPU_API +#include "cpu/averagepool_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor( + infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return 
op::averagepool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize( + infiniopAvgPoolDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopAvgPool( + infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/batch_norm/batch_norm.h b/src/infiniop/ops/batch_norm/batch_norm.h new file mode 100644 index 000000000..b550a752b --- /dev/null +++ b/src/infiniop/ops/batch_norm/batch_norm.h @@ -0,0 +1,56 @@ +#ifndef __BATCH_NORM_H__ +#define __BATCH_NORM_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::batch_norm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + BatchNormInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + BatchNormInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t 
workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t running_mean_desc, \ + infiniopTensorDescriptor_t running_var_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + float momentum, \ + float eps \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + void * running_mean, \ + void * running_var, \ + const void * input, \ + const void * weight, \ + const void * bias, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc new file mode 100644 index 000000000..69c563c3a --- /dev/null +++ b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.cc @@ -0,0 +1,129 @@ +#include "batch_norm_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::batch_norm::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps + ); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + size_t WorkSpaceSize = 0; + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata *input, + const Tdata *weight, + const Tdata *bias +) { + +#pragma omp parallel for + for(size_t c = 0; c < info.channel_size; c++) + { + float sum_sq = 0., sum=0.; + for(size_t b = 0; b < info.batch_size; b++) + { + sum += op::common_cpu::reduce_op::sum( + input + (b * info.channel_size + c) * info.dim_size, + info.dim_size, + 1 + ); + sum_sq += op::common_cpu::reduce_op::sumSquared( + input + (b * info.channel_size + c) * info.dim_size, + info.dim_size, + 1 + ); + } + float batch_and_dim_size = (info.batch_size * info.dim_size); + float E = sum / batch_and_dim_size; + float var_biased = sum_sq / batch_and_dim_size - E * E; + float var_unbiased = var_biased * batch_and_dim_size / (batch_and_dim_size - 1.0); + + auto running_mean_ptr = running_mean + c * info.running_mean_stride; + auto running_var_ptr = running_var + c * info.running_var_stride; + *running_mean_ptr = utils::cast((1 - info.momentum) * utils::cast(*running_mean_ptr) + info.momentum * E); + *running_var_ptr = utils::cast((1 - info.momentum) * utils::cast(*running_var_ptr) + info.momentum * var_unbiased); + + for(size_t b = 0; b < info.batch_size; b++) + { + for(size_t d = 0; d < info.dim_size; d++) + { + auto input_ptr = input + ((b * 
info.channel_size + c) * info.dim_size) + d; + auto output_ptr = output + ((b * info.channel_size + c) * info.dim_size) + d;; + auto weight_ptr = weight + c * info.weight_stride; + auto bias_ptr = bias + c * info.bias_stride; + *output_ptr = utils::cast( + (utils::cast(*input_ptr) - E) / std::sqrt(var_biased + info.eps) * utils::cast(*weight_ptr) + utils::cast(*bias_ptr) + ); + } + } + } + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_BATCH_NORM(TDATA) \ + CHECK_STATUS(calculate_batch_norm(_info, \ +(TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias)) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream +) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_BATCH_NORM(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_BATCH_NORM(bf16_t); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_BATCH_NORM(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h new file mode 100644 index 000000000..ac38987ef --- /dev/null +++ b/src/infiniop/ops/batch_norm/cpu/batch_norm_cpu.h @@ -0,0 +1,9 @@ +#ifndef __BATCH_NORM_CPU_H__ +#define __BATCH_NORM_CPU_H__ + +#include "../batch_norm.h" + +DESCRIPTOR(cpu) + + +#endif // __BATCH_NORM_CPU_H__ diff --git a/src/infiniop/ops/batch_norm/cuda/kernel.cuh b/src/infiniop/ops/batch_norm/cuda/kernel.cuh new file mode 100644 index 000000000..952d7ef79 --- /dev/null +++ b/src/infiniop/ops/batch_norm/cuda/kernel.cuh @@ -0,0 +1,64 @@ +#ifndef __BATCH_NORM_KERNEL_CUH__ +#define __BATCH_NORM_KERNEL_CUH__ + +#include +#include "../../../reduce/cuda/reduce.cuh" + +template +__device__ void batchNormKernel( + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + float momentum, + float eps +) { + auto output_ptr = output + dim_size * blockIdx.x; + auto input_ptr = input + dim_size * blockIdx.x; + + auto running_mean_ptr = running_mean + running_mean_stride * blockIdx.x; + auto running_var_ptr = running_var + running_var_stride * blockIdx.x; + auto weight_ptr = weight + weight_stride * blockIdx.x; + auto bias_ptr = bias + bias_stride * blockIdx.x; + + Tcompute sum_squared = 0., sum = 0.; + for(size_t b = 0; b < batch_size; b++) + { + sum += op::common_cuda::reduce_op::sum( + input_ptr + b * (channel_size * dim_size), dim_size + ); + sum_squared += op::common_cuda::reduce_op::sumSquared( + input_ptr + b * (channel_size * dim_size), dim_size + ); + } + + __shared__ Tcompute E, var_biased; + if (threadIdx.x == 0) { + E = sum / Tcompute(batch_size * dim_size); + var_biased = sum_squared / Tcompute(batch_size * dim_size) - E * E; + Tcompute var_unbiased = var_biased * Tcompute(batch_size * dim_size) / Tcompute(batch_size * dim_size - 1); + *running_mean_ptr = Tcompute(1 - momentum) * Tcompute(*running_mean_ptr) + Tcompute(momentum) * E; + *running_var_ptr = Tcompute(1 - momentum) * Tcompute(*running_var_ptr) + Tcompute(momentum) * var_unbiased; + } 
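+    // Thread 0 has written the batch statistics (E, var_biased) into shared memory
+    // and updated the running mean/var; the barrier below publishes them to every
+    // thread in the block before the normalization loop.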
+ __syncthreads(); + + for (size_t n = threadIdx.x; n < batch_size * dim_size; n += BLOCK_SIZE) + { + size_t b = n / dim_size, d = n % dim_size; + *(output_ptr + b * channel_size * dim_size + d) = ( + Tcompute(*(input_ptr + b * channel_size * dim_size + d)) - E + ) / sqrtf(float(var_biased + Tcompute(eps))) * Tcompute(*weight_ptr) + Tcompute(*bias_ptr); + } +} + +#endif // __BATCH_NORM_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/batch_norm/info.h b/src/infiniop/ops/batch_norm/info.h new file mode 100644 index 000000000..f78359077 --- /dev/null +++ b/src/infiniop/ops/batch_norm/info.h @@ -0,0 +1,73 @@ +#ifndef __BATCH_NORM_INFO_H__ +#define __BATCH_NORM_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::batch_norm { + +class BatchNormInfo { +private: + BatchNormInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t batch_size, channel_size, dim_size; + + ptrdiff_t running_mean_stride; + ptrdiff_t running_var_stride; + ptrdiff_t weight_stride; + ptrdiff_t bias_stride; + float momentum; + float eps; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createBatchNormInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == 3, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + CHECK_SAME_SHAPE(output_desc->shape(), input_desc->shape()); + size_t batch_size = output_desc->dim(0), + channel_size = output_desc->dim(1), + dim_size = output_desc->dim(2); + CHECK_SAME_SHAPE( + running_mean_desc->shape(), running_var_desc->shape(), + weight_desc->shape(), bias_desc->shape() + ); + CHECK_OR_RETURN( + running_mean_desc->ndim() == 1 && running_mean_desc->dim(0) == channel_size, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(BatchNormInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + batch_size, channel_size, dim_size, + running_mean_desc->stride(0), + running_var_desc->stride(0), + weight_desc->stride(0), + bias_desc->stride(0), + momentum, + eps +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __BATCH_NORM_INFO_H__ diff --git a/src/infiniop/ops/batch_norm/metax/batch_norm_metax.h b/src/infiniop/ops/batch_norm/metax/batch_norm_metax.h new file mode 100644 index 000000000..0c2d8c800 --- /dev/null +++ b/src/infiniop/ops/batch_norm/metax/batch_norm_metax.h @@ -0,0 +1,8 @@ +#ifndef __BATCH_NORM_METAX_H__ +#define __BATCH_NORM_METAX_H__ + +#include "../batch_norm.h" + +DESCRIPTOR(metax) + +#endif // __BATCH_NORM_METAX_H__ diff --git a/src/infiniop/ops/batch_norm/metax/batch_norm_metax.maca b/src/infiniop/ops/batch_norm/metax/batch_norm_metax.maca new file mode 100644 index 000000000..0fa8e22a2 --- /dev/null +++ b/src/infiniop/ops/batch_norm/metax/batch_norm_metax.maca @@ -0,0 +1,181 @@ +#include 
"../../../devices/metax/metax_common.h" +#include "batch_norm_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::batch_norm::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + + float momentum, + float eps +) { + batchNormKernel( + output, + running_mean, + running_var, + input, + weight, + bias, + + batch_size, + channel_size, + dim_size, + + running_mean_stride, + running_var_stride, + weight_stride, + bias_stride, + + momentum, + eps + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + hcStream_t stream +) { + launchKernel<<>>( + output, + running_mean, + running_var, + input, + weight, + bias, + + info.batch_size, + info.channel_size, + info.dim_size, + + info.running_mean_stride, + info.running_var_stride, + info.weight_stride, + info.bias_stride, + info.momentum, + info.eps + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps + ); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_BATCH_NORM(BLOCK_SIZE, TDATA) \ + calculate_batch_norm(_info, (TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA *)input, (const TDATA *)weight, 
(const TDATA *)bias, stream) + #define CALCULATE_BATCH_NORM_WITH_MATEX_BLOCK(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, cuda_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_BATCH_NORM_WITH_MATEX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_BATCH_NORM_WITH_MATEX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + + #undef CALCULATE_BATCH_NORM_WITH_MATEX_BLOCK + #undef CALCULATE_BATCH_NORM + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::batch_norm::metax diff --git a/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu new file mode 100644 index 000000000..b79fed65f --- /dev/null +++ b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cu @@ -0,0 +1,186 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "batch_norm_nvidia.cuh" + +#include +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" + + +#include "../info.h" + +namespace op::batch_norm::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + + size_t batch_size, + size_t channel_size, + size_t dim_size, + ptrdiff_t running_mean_stride, + ptrdiff_t running_var_stride, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + + float momentum, + float eps +) { + + batchNormKernel( + output, + running_mean, + running_var, + input, + weight, + bias, + + batch_size, + channel_size, + dim_size, + + running_mean_stride, + running_var_stride, + weight_stride, + bias_stride, + + momentum, + eps + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_batch_norm( + const BatchNormInfo &info, + Tdata * output, + Tdata * running_mean, + Tdata * running_var, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + + cudaStream_t stream +) { + launchKernel<<>>( + output, + running_mean, + running_var, + input, + weight, + bias, + + info.batch_size, + info.channel_size, + info.dim_size, + + info.running_mean_stride, + info.running_var_stride, + info.weight_stride, + info.bias_stride, + info.momentum, + info.eps + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t 
input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = BatchNormInfo::createBatchNormInfo( + output_desc, + running_mean_desc, + running_var_desc, + input_desc, + weight_desc, + bias_desc, + momentum, + eps + ); + CHECK_RESULT(result); + const BatchNormInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_BATCH_NORM(BLOCK_SIZE, TDATA) \ + calculate_batch_norm(_info, (TDATA *)output, (TDATA *)running_mean, (TDATA *)running_var, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias, stream) + #define CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_BATCH_NORM(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_BATCH_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::batch_norm::nvidia diff --git a/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh new file mode 100644 index 000000000..33c93f2b4 --- /dev/null +++ b/src/infiniop/ops/batch_norm/nvidia/batch_norm_nvidia.cuh @@ -0,0 +1,10 @@ +#ifndef __BATCH_NORM_NVIDIA_API_H__ +#define __BATCH_NORM_NVIDIA_API_H__ + +// #ifdef ENABLE_NINETOOTHED +#include "../batch_norm.h" +DESCRIPTOR(nvidia) + +// #endif + +#endif // __BATCH_NORM_NVIDIA_API_H__ diff --git a/src/infiniop/ops/batch_norm/operator.cc b/src/infiniop/ops/batch_norm/operator.cc new file mode 100644 index 000000000..81418b2d2 --- /dev/null +++ b/src/infiniop/ops/batch_norm/operator.cc @@ -0,0 +1,168 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/batch_norm.h" + +#ifdef ENABLE_CPU_API +#include "cpu/batch_norm_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/batch_norm_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/batch_norm_metax.h" +#endif + 
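The dispatcher below routes the public batch-norm C API to the per-device descriptors. For reference, a minimal host-side sketch of that API follows; it assumes the tensor descriptors have already been created elsewhere with the shapes validated by BatchNormInfo (3-D input/output of shape (N, C, D), 1-D per-channel running-mean/var/weight/bias of length C), and the helper name and momentum/eps values are illustrative placeholders, not defaults defined by this patch.

// Illustrative sketch only: tensor-descriptor creation is assumed done elsewhere,
// and momentum/eps are placeholder values.
static infiniStatus_t runBatchNormOnce(infiniopHandle_t handle,
                                       infiniopTensorDescriptor_t y_desc,
                                       infiniopTensorDescriptor_t mean_desc,
                                       infiniopTensorDescriptor_t var_desc,
                                       infiniopTensorDescriptor_t x_desc,
                                       infiniopTensorDescriptor_t w_desc,
                                       infiniopTensorDescriptor_t b_desc,
                                       void *y, void *running_mean, void *running_var,
                                       const void *x, const void *w, const void *b,
                                       void *workspace, size_t workspace_capacity,
                                       void *stream) {
    infiniopBatchNormDescriptor_t desc = nullptr;
    // Create the operator descriptor (dtype/shape checks happen here).
    infiniStatus_t status = infiniopCreateBatchNormDescriptor(
        handle, &desc, y_desc, mean_desc, var_desc, x_desc, w_desc, b_desc,
        /*momentum=*/0.1f, /*eps=*/1e-5f);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }
    // Query the workspace requirement, then launch on the given stream.
    size_t workspace_size = 0;
    status = infiniopGetBatchNormWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS && workspace_size <= workspace_capacity) {
        status = infiniopBatchNorm(desc, workspace, workspace_size,
                                   y, running_mean, running_var, x, w, b, stream);
    }
    infiniopDestroyBatchNormDescriptor(desc);
    return status;
}

The same create, get-workspace, run, destroy sequence applies to the other operators added in this patch (for example AvgPool and CrossEntropyLoss), each dispatched through an operator.cc of the same shape.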
+__C infiniStatus_t infiniopCreateBatchNormDescriptor( + infiniopHandle_t handle, + infiniopBatchNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t running_mean_desc, + infiniopTensorDescriptor_t running_var_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float momentum, + float eps +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::batch_norm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + running_mean_desc, \ + running_var_desc, \ + input_desc, \ + weight_desc, \ + bias_desc, \ + momentum, \ + eps \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetBatchNormWorkspaceSize(infiniopBatchNormDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopBatchNorm( + infiniopBatchNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * running_mean, + void * running_var, + const void * input, + const void * weight, + const void * bias, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + running_mean, \ + running_var, \ + input, \ + weight, \ + bias, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyBatchNormDescriptor(infiniopBatchNormDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc new file mode 100644 index 000000000..af97c1d09 --- /dev/null 
+++ b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc @@ -0,0 +1,321 @@ +#include "cross_entropy_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::cross_entropy_loss::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + std::vector logits_shape; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const std::vector &shape) + : handle(handle_ptr), logits_shape(shape) { + // 计算workspace大小:需要存储per-sample loss + size_t N = logits_shape[0]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + workspace_size = N * inner_size * sizeof(float); + } + + void cross_entropy_f16_as_float(float *workspace, float *loss_result, + const fp16_t *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + // 转换F16 logits为float + size_t total_logits_size = N * C * inner_size; + std::vector float_logits(total_logits_size); + for (size_t i = 0; i < total_logits_size; ++i) { + float_logits[i] = utils::cast(logits[i]); + } + + // 使用float精度计算 + cross_entropy_cpu_float(workspace, loss_result, float_logits.data(), target); + } + + // 通用的float版本交叉熵计算 + void cross_entropy_cpu_float(float *workspace, float *loss_result, + const float *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + max_logit = std::max(max_logit, logits[logit_idx]); + } + + // 计算exp的和(减去最大值保证数值稳定) + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + sum_exp += std::exp(logits[logit_idx] - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit = logits[target_logit_idx]; + + // 计算交叉熵损失:log_softmax[target] = logit[target] - log(sum_exp) - max_logit + // 所以 -log_softmax[target] = log(sum_exp) + max_logit - logit[target] + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失(忽略ignore_index的样本) + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + *loss_result = valid_count > 0 ? 
static_cast(total_loss / valid_count) : 0.0f; + } + + // 通用模板版本(用于F32和BF16) + template + void cross_entropy_cpu_generic(float *workspace, T *loss_result, + const T *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + max_logit = std::max(max_logit, logit_val); + } + + // 计算exp的和 + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + sum_exp += std::exp(logit_val - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit; + if constexpr (std::is_same::value) { + target_logit = utils::cast(logits[target_logit_idx]); + } else { + target_logit = logits[target_logit_idx]; + } + + // 计算交叉熵损失 + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失 + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + float mean_loss = valid_count > 0 ? 
static_cast(total_loss / valid_count) : 0.0f; + + // 转换回输出类型 + if constexpr (std::is_same::value) { + *loss_result = utils::cast(mean_loss); + } else { + *loss_result = static_cast(mean_loss); + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + logits_shape(std::move(other.logits_shape)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, const std::vector &shape) { + Opaque opaque(handle_ptr, shape); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, const void *target, + infiniDtype_t dtype) const { + if (!workspace || !loss || !logits || !target) { + return INFINI_STATUS_BAD_PARAM; + } + + if (workspace_size < this->workspace_size) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + float *workspace_ptr = static_cast(workspace); + const int64_t *target_ptr = static_cast(target); + + switch (dtype) { + case INFINI_DTYPE_F32: { + const float *logits_ptr = static_cast(logits); + float *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + case INFINI_DTYPE_F16: { + const fp16_t *logits_ptr = static_cast(logits); + fp16_t *loss_ptr = static_cast(loss); + + // F16特殊处理:使用float计算 + float temp_loss; + cross_entropy_f16_as_float(workspace_ptr, &temp_loss, logits_ptr, target_ptr); + *loss_ptr = utils::cast(temp_loss); + break; + } + + case INFINI_DTYPE_BF16: { + const bf16_t *logits_ptr = static_cast(logits); + bf16_t *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + + size_t get_workspace_size() const { + return workspace_size; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig_shape = logits_desc->shape(); + std::vector logits_shape; + + if (orig_shape.size() == 1) { + logits_shape = {1, orig_shape[0]}; + } else { + logits_shape = orig_shape; + } + + if (logits_shape.size() < 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto opaque_result = Opaque::create(handle, logits_shape); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->get_workspace_size(), opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { + return _opaque->calculate(workspace, workspace_size, loss, logits, target, _dtype); +} + +} // namespace op::cross_entropy_loss::cpu diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h new file mode 100644 index 000000000..8afec63d0 --- /dev/null +++ 
b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CPU_H__ +#define __CROSS_ENTROPY_LOSS_CPU_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(cpu) + +#endif // __CROSS_ENTROPY_LOSS_CPU_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h new file mode 100644 index 000000000..dad108d78 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h @@ -0,0 +1,48 @@ +#ifndef __CROSS_ENTROPY_LOSS_H__ +#define __CROSS_ENTROPY_LOSS_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cross_entropy_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t loss_desc, \ + infiniopTensorDescriptor_t logits_desc, \ + infiniopTensorDescriptor_t target_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *loss, \ + const void *logits, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif // __CROSS_ENTROPY_LOSS_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh new file mode 100644 index 000000000..5279011ef --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh @@ -0,0 +1,67 @@ +#ifndef __CROSS_ENTROPY_KERNEL_CUH__ +#define __CROSS_ENTROPY_KERNEL_CUH__ + +#include +#include +#include + +__device__ __forceinline__ float to_float(float val) { return val; } + +__device__ __forceinline__ float to_float(half val) { + return __half2float(val); +} + +__device__ __forceinline__ float to_float(__hpcc_bfloat16 val) { + return __bfloat162float(val); +} + +template +__global__ void cross_entropy_loss_kernel(T_out *loss, const T_in *logits, + const int64_t *target, int N, int C, + long long inner_size, + int64_t ignore_index) { + + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + long long total = (long long)N * inner_size; + if (idx >= total) { + return; + } + + int n = (int)(idx / inner_size); + int inner = (int)(idx % inner_size); + + int64_t t = target[idx]; + + if (t == ignore_index) { + loss[idx] = (T_out)0.0f; + return; + } + if (t < 0 || t >= C) { + loss[idx] = (T_out)0.0f; + return; + } + + const long long base_offset = ((long long)n * C * inner_size) + inner; + + // 1. 找到 logits 中的最大值 + float max_val = -HUGE_VALF; // 使用浮点数的最大负值 + for (int c = 0; c < C; ++c) { + long long offset = base_offset + (long long)c * inner_size; + max_val = fmaxf(max_val, to_float(logits[offset])); + } + + // 2. 计算 sum(exp(x - max_val)) + float sum_exp = 0.0f; + for (int c = 0; c < C; ++c) { + long long offset = base_offset + (long long)c * inner_size; + sum_exp += expf(to_float(logits[offset]) - max_val); + } + + // 3. 
计算最终 loss + long long target_offset = base_offset + (long long)t * inner_size; + float logit_tgt = to_float(logits[target_offset]); + + loss[idx] = (T_out)(logf(sum_exp) + max_val - logit_tgt); +} + +#endif // __CROSS_ENTROPY_KERNEL_CUH__ diff --git a/src/infiniop/ops/cross_entropy_loss/info.h b/src/infiniop/ops/cross_entropy_loss/info.h new file mode 100644 index 000000000..5278bf912 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/info.h @@ -0,0 +1,36 @@ +#ifndef __CROSS_ENTROPY_LOSS_INFO_H__ +#define __CROSS_ENTROPY_LOSS_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::cross_entropy_loss { + +class CrossEntropyInfo { +public: + CrossEntropyInfo() = default; + size_t batch = 0; + size_t num_classes = 0; + infiniDtype_t dtype; + + static utils::Result create( + infiniopTensorDescriptor_t loss, + infiniopTensorDescriptor_t logits, + infiniopTensorDescriptor_t target) { + + if (logits->ndim() != 2 || loss->ndim() != 1 || target->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CrossEntropyInfo info; + info.batch = logits->dim(0); + info.num_classes = logits->dim(1); + info.dtype = logits->dtype(); + return utils::Result(std::move(info)); + } +}; + +} // namespace op::cross_entropy_loss + +#endif // __CROSS_ENTROPY_LOSS_INFO_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h new file mode 100644 index 000000000..382d555e0 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_METAX_H__ +#define __CROSS_ENTROPY_METAX_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(metax) + +#endif // __CROSS_ENTROPY_METAX_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca new file mode 100644 index 000000000..94f611e7a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca @@ -0,0 +1,145 @@ +#include +#include +#include +#include +#include +#include +#include +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "cross_entropy_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cross_entropy_loss::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector logits_shape; + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + ~Opaque() = default; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + if (dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto opaque = new Opaque(handle->internal()); + const auto &orig_shape = logits_desc->shape(); + + if (orig_shape.size() == 1) { + opaque->logits_shape = {1, orig_shape[0]}; + } else { + opaque->logits_shape = orig_shape; + } + + if (opaque->logits_shape.size() < 2) return INFINI_STATUS_BAD_TENSOR_SHAPE; + + const auto &s = opaque->logits_shape; + long long N = (long long)s[0]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) inner *= (long long)s[i]; + + size_t 
workspace_size = (size_t)(N * inner) * sizeof(float); + *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *loss, + const void *logits, const void *target, void *stream_) const { + + const auto &shape = _opaque->logits_shape; + int N = (int)shape[0]; + int C = (int)shape[1]; + long long inner_size = 1; + for (size_t i = 2; i < shape.size(); ++i) + inner_size *= shape[i]; + + long long total = (long long)N * inner_size; + + size_t need_ws = (size_t)total * sizeof(float); + if (workspace_size < need_ws) return INFINI_STATUS_INTERNAL_ERROR; + float* per_sample_loss = reinterpret_cast(workspace); + + const int64_t *typed_target = reinterpret_cast(target); + const int64_t ignore_index = -100; + hcStream_t stream = (hcStream_t)stream_; + + dim3 blockSize(256); + dim3 gridSize((total + blockSize.x - 1) / blockSize.x); + + if (_dtype == INFINI_DTYPE_F32) { + cross_entropy_loss_kernel + <<>>( + per_sample_loss, (const float*)logits, typed_target, + N, C, inner_size, ignore_index); + } else if (_dtype == INFINI_DTYPE_F16) { + cross_entropy_loss_kernel + <<>>( + per_sample_loss, (const half*)logits, typed_target, + N, C, inner_size, ignore_index); + } else if (_dtype == INFINI_DTYPE_BF16) { + cross_entropy_loss_kernel<__hpcc_bfloat16, float> + <<>>( + per_sample_loss, (const __hpcc_bfloat16*)logits, typed_target, + N, C, inner_size, ignore_index); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (hcGetLastError() != hcSuccess) return INFINI_STATUS_INTERNAL_ERROR; + + std::vector h_loss((size_t)total); + std::vector h_target((size_t)total); + if (hcMemcpyAsync(h_loss.data(), per_sample_loss, need_ws, hcMemcpyDeviceToHost, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + if (hcMemcpyAsync(h_target.data(), typed_target, (size_t)total * sizeof(int64_t), hcMemcpyDeviceToHost, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + if (hcStreamSynchronize(stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + + double acc = 0.0; + long long count = 0; + for (long long i = 0; i < total; ++i) { + if (h_target[i] != ignore_index) { + acc += (double)h_loss[i]; + count++; + } + } + double mean = (count > 0) ? 
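// Reduction strategy shared with the CPU and NVIDIA backends: the kernel above
// writes one float per (n, inner) sample into the workspace; the mean over
// samples whose target != ignore_index is then taken on the host and written
// back to `loss` as a single element of the input dtype.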
(acc / (double)count) : 0.0; + + if (_dtype == INFINI_DTYPE_F32) { + float v = (float)mean; + if (hcMemcpyAsync(loss, &v, sizeof(float), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } else if (_dtype == INFINI_DTYPE_F16) { + half v = __float2half((float)mean); + if (hcMemcpyAsync(loss, &v, sizeof(half), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } else if (_dtype == INFINI_DTYPE_BF16) { + __hpcc_bfloat16 v = __float2bfloat16_rn((float)mean); + if (hcMemcpyAsync(loss, &v, sizeof(__hpcc_bfloat16), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } + if (hcStreamSynchronize(stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy_loss::metax diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu new file mode 100644 index 000000000..3d795a67a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "cross_entropy_loss_nvidia.cuh" + +namespace op::cross_entropy_loss::nvidia { +namespace cuda { + +__device__ __forceinline__ float to_float(float v) { return v; } +__device__ __forceinline__ float to_float(double v) { return (float)v; } +__device__ __forceinline__ float to_float(half v) { return __half2float(v); } +__device__ __forceinline__ float to_float(__nv_bfloat16 v) { + return __bfloat162float(v); +} + +template +__global__ void +softmaxCrossEntropy_per_sample(T_out *__restrict__ loss, + const T_in *__restrict__ logits, + const int64_t *__restrict__ target, int N, int C, + long long inner_size, int64_t ignore_index) { + long long total = (long long)N * inner_size; + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total) { + return; + } + + int n = (int)(idx / inner_size); + int inr = (int)(idx % inner_size); + + int64_t t = target[(long long)n * inner_size + inr]; + if (ignore_index != LLONG_MIN && t == ignore_index) { + loss[idx] = (T_out)0; + return; + } + if (t < 0 || t >= C) { + loss[idx] = (T_out)0; + return; + } + + const long long base = ((long long)n * C * inner_size) + inr; + + // 数值稳定 LSE:lse = log(sum exp(x - m)) + m + float m = -CUDART_INF_F; + for (int c = 0; c < C; ++c) { + m = fmaxf(m, to_float(logits[base + (long long)c * inner_size])); + } + + float sum_exp = 0.f; + for (int c = 0; c < C; ++c) { + sum_exp += expf(to_float(logits[base + (long long)c * inner_size]) - m); + } + + float lse = logf(sum_exp) + m; + float logit_t = to_float(logits[base + (long long)(int)t * inner_size]); + loss[idx] = (T_out)(lse - logit_t); +} + +} // namespace cuda + +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector logits_shape; + Opaque(std::shared_ptr p) : internal(p) {} + ~Opaque() = default; +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) + auto handle = reinterpret_cast(handle_); + auto 
dtype = logits_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig = logits_desc->shape(); + auto opaque = new Opaque(handle->internal()); + + if (orig.size() == 1) { + opaque->logits_shape = {1, orig[0]}; + } else { + opaque->logits_shape = orig; + } + + const auto &s = opaque->logits_shape; + long long N = (long long)s[0]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) { + inner *= (long long)s[i]; + } + + size_t workspace_size = (size_t)(N * inner) * sizeof(float); + *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) + const auto &s = _opaque->logits_shape; + int N = (int)s[0]; + int C = (int)s[1]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) { + inner *= (long long)s[i]; + } + long long total = (long long)N * inner; + + size_t need_ws = (size_t)total * sizeof(float); + if (workspace_size < need_ws) { + return INFINI_STATUS_INTERNAL_ERROR; + } + float *per_sample = reinterpret_cast(workspace); + + const int64_t *tgt_i64 = reinterpret_cast(target); + const int64_t ignore_index = -100; + + // 1) 写 per-sample loss -> workspace(float) + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + cudaStream_t st = (cudaStream_t)stream; + + if (_dtype == INFINI_DTYPE_F32) { + cuda::softmaxCrossEntropy_per_sample<<>>( + per_sample, (const float *)logits, tgt_i64, N, C, inner, ignore_index); + } else if (_dtype == INFINI_DTYPE_F16) { + cuda::softmaxCrossEntropy_per_sample<<>>( + per_sample, (const half *)logits, tgt_i64, N, C, inner, ignore_index); + } else if (_dtype == INFINI_DTYPE_BF16) { + cuda::softmaxCrossEntropy_per_sample<__nv_bfloat16, float> + <<>>(per_sample, (const __nv_bfloat16 *)logits, + tgt_i64, N, C, inner, ignore_index); + } + { + auto err = cudaGetLastError(); + if (err != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } + + // 2) host 侧 mean(仅统计 target != ignore_index) + std::vector h_loss((size_t)total); + std::vector h_tgt((size_t)total); + if (cudaMemcpyAsync(h_loss.data(), per_sample, need_ws, + cudaMemcpyDeviceToHost, st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + if (cudaMemcpyAsync(h_tgt.data(), tgt_i64, (size_t)total * sizeof(int64_t), + cudaMemcpyDeviceToHost, st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + if (cudaStreamSynchronize(st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + double acc = 0.0; + long long cnt = 0; + for (long long i = 0; i < total; ++i) { + if (h_tgt[i] != ignore_index) { + acc += (double)h_loss[i]; + ++cnt; + } + } + double mean = (cnt > 0) ? 
(acc / (double)cnt) : 0.0; + + // 3) 把标量 mean 写回 device 的 loss 指针(按输入 dtype 写 1 个元素) + if (_dtype == INFINI_DTYPE_F32) { + float v = (float)mean; + if (cudaMemcpyAsync(loss, &v, sizeof(float), cudaMemcpyHostToDevice, st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } else if (_dtype == INFINI_DTYPE_F16) { + half v = __float2half((float)mean); + if (cudaMemcpyAsync(loss, &v, sizeof(half), cudaMemcpyHostToDevice, st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } else if (_dtype == INFINI_DTYPE_BF16) { + __nv_bfloat16 v = __float2bfloat16((float)mean); + if (cudaMemcpyAsync(loss, &v, sizeof(__nv_bfloat16), cudaMemcpyHostToDevice, + st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } + if (cudaStreamSynchronize(st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} +} // namespace op::cross_entropy_loss::nvidia diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh new file mode 100644 index 000000000..843fc943d --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CUDA_CUH__ +#define __CROSS_ENTROPY_LOSS_CUDA_CUH__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(nvidia) + +#endif // __CROSS_ENTROPY_LOSS_CUDA_CUH__ diff --git a/src/infiniop/ops/cross_entropy_loss/operator.cc b/src/infiniop/ops/cross_entropy_loss/operator.cc new file mode 100644 index 000000000..e9a47558f --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/operator.cc @@ -0,0 +1,143 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy_loss.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cross_entropy_loss_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast< \ + op::cross_entropy_loss::NAMESPACE::Descriptor **>(desc_ptr), \ + loss_desc, logits_desc, target_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize( + infiniopCrossEntropyLossDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc) \ + ->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
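// End-to-end call sequence for the public C API declared in
// include/infiniop/ops/cross_entropy_loss.h (sketch; the handle, stream, device
// buffers and the tensor descriptors for loss, logits and target are assumed to
// already exist; CHECK and device_malloc stand in for the caller's error
// handling and allocation):
//
//   infiniopCrossEntropyLossDescriptor_t ce_desc;
//   CHECK(infiniopCreateCrossEntropyLossDescriptor(handle, &ce_desc,
//                                                  loss_desc, logits_desc, target_desc));
//
//   size_t ws_size = 0;
//   CHECK(infiniopGetCrossEntropyLossWorkspaceSize(ce_desc, &ws_size));
//   void *workspace = device_malloc(ws_size);   // one float per sample
//
//   // logits: (N, C[, d1, ...]) in F16/BF16/F32, target: int64 class indices,
//   // loss: a single element; targets equal to -100 are excluded from the mean.
//   CHECK(infiniopCrossEntropyLoss(ce_desc, workspace, ws_size,
//                                  d_loss, d_logits, d_target, stream));
//
//   CHECK(infiniopDestroyCrossEntropyLossDescriptor(ce_desc));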
GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopCrossEntropyLoss( + infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, loss, logits, target, \ + stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor( + infiniopCrossEntropyLossDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..6b5edef36 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,54 @@ +#include "div_cpu.h" + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h new file 
mode 100644 index 000000000..0373b766f --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -0,0 +1,19 @@ +#ifndef __DIV_CPU_H__ +#define __DIV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(div, cpu) + +namespace op::div::cpu { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a / b; + } +} DivOp; +} // namespace op::div::cpu + +#endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh new file mode 100644 index 000000000..cefbf0141 --- /dev/null +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __DIV_CUDA_H__ +#define __DIV_CUDA_H__ + +namespace op::div::cuda { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __h2div(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hdiv(a, b); + } else if constexpr (std::is_same_v) { + return __fdiv_rn(a, b); + } else { + return a / b; + } + } +} DivOp; +} // namespace op::div::cuda + +#endif // __DIV_CUDA_H__ diff --git a/src/infiniop/ops/div/metax/div_metax.h b/src/infiniop/ops/div/metax/div_metax.h new file mode 100644 index 000000000..1e56a7d44 --- /dev/null +++ b/src/infiniop/ops/div/metax/div_metax.h @@ -0,0 +1,8 @@ +#ifndef __DIV_METAX_API_H__ +#define __DIV_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(div, metax) + +#endif // __DIV_METAX_API_H__ diff --git a/src/infiniop/ops/div/metax/div_metax.maca b/src/infiniop/ops/div/metax/div_metax.maca new file mode 100644 index 000000000..a8ecd8643 --- /dev/null +++ b/src/infiniop/ops/div/metax/div_metax.maca @@ -0,0 +1,62 @@ +#include "div_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::div::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::DivOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, 
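// Call sequence for the public div API that routes into this dtype switch
// (sketch; descriptors for c, a, b — all the same shape, one of
// BF16/F16/F32/F64 — and the device buffers are assumed to exist, and
// device_malloc is a hypothetical allocator):
//
//   infiniopDivDescriptor_t div_desc;
//   infiniopCreateDivDescriptor(handle, &div_desc, c_desc, a_desc, b_desc);
//
//   size_t ws = 0;
//   infiniopGetDivWorkspaceSize(div_desc, &ws);
//   void *workspace = device_malloc(ws);
//
//   infiniopDiv(div_desc, workspace, ws, d_c, d_a, d_b, stream);   // c = a / b, elementwise
//   infiniopDestroyDivDescriptor(div_desc);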
cuda::DivOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::metax diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu new file mode 100644 index 000000000..4cb64af63 --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "div_nvidia.cuh" + +namespace op::div::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::DivOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::DivOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..1ad8af94e --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(div, nvidia) + +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..162156887 --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/div.h" + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + 
reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc new file mode 100644 index 000000000..66555cdd5 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.cc @@ -0,0 +1,82 @@ +#include "equal_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = c_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL); + CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size 
----------------------- + + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +) const { + std::vector contiguous_strides(_info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < _info.ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = _info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + size_t elem_size = infiniSizeOf(_info.dtype); + auto c_ptr = reinterpret_cast(c); + *c_ptr = true; + #pragma omp parallel for + for(size_t i = 0; i < total_size; i ++) { + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + size_t rem = i; + for(int d = _info.ndim - 1; d >= 0; d --) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + a_ptr += dim_index * _info.a_strides[d]; + b_ptr += dim_index * _info.b_strides[d]; + } + if (memcmp(a_ptr, b_ptr, elem_size) != 0) { + *c_ptr = false; + } + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h new file mode 100644 index 000000000..a09c63d9b --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.h @@ -0,0 +1,9 @@ +#ifndef __EQUAL_CPU_H__ +#define __EQUAL_CPU_H__ + +#include "../equal.h" + +DESCRIPTOR(cpu) + + +#endif // __EQUAL_CPU_H__ diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh new file mode 100644 index 000000000..193c94333 --- /dev/null +++ b/src/infiniop/ops/equal/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __EQUAL_KERNEL_CUH__ +#define __EQUAL_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void equalKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + if (threadIdx.x == 0) + { + *c = true; + } + __syncthreads(); + for(size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) { + auto a_ptr = a; + auto b_ptr = b; + size_t rem = i; + for(int d = ndim - 1; d >= 0; d --) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + a_ptr += dim_index * a_strides[d]; + b_ptr += dim_index * b_strides[d]; + } + if ((*a_ptr != *b_ptr) && (*c == true)) { + *c = false; + } + + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __EQUAL_KERNEL_CUH__ diff --git a/src/infiniop/ops/equal/equal.h b/src/infiniop/ops/equal/equal.h new file mode 100644 index 000000000..12cc0ba16 --- /dev/null +++ b/src/infiniop/ops/equal/equal.h @@ -0,0 +1,48 @@ +#ifndef __EQUAL_H__ +#define __EQUAL_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::equal::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + EqualInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + EqualInfo info, \ + 
size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * c, \ + const void * a, \ + const void * b, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/equal/info.h b/src/infiniop/ops/equal/info.h new file mode 100644 index 000000000..5dd2c0a54 --- /dev/null +++ b/src/infiniop/ops/equal/info.h @@ -0,0 +1,46 @@ +#ifndef __EQUAL_INFO_H__ +#define __EQUAL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::equal { + +class EqualInfo { +private: + EqualInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + size_t ndim; + infiniDtype_t dtype; + std::vector a_shape; + std::vector a_strides; + std::vector b_strides; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createEqualInfo( + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(c_desc->ndim() == 1 && c_desc->dim(0) == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_SAME_SHAPE(a_desc->shape(), b_desc->shape()); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(EqualInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + a_desc->ndim(), + a_desc->dtype(), + a_desc->shape(), + a_desc->strides(), + b_desc->strides() +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __EQUAL_INFO_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h new file mode 100644 index 000000000..0ebb67bb3 --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __EQUAL_METAX_H__ +#define __EQUAL_METAX_H__ + +#include "../equal.h" + +DESCRIPTOR(metax) + +#endif // __EQUAL_METAX_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca new file mode 100644 index 000000000..c8f4dda7d --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.maca @@ -0,0 +1,162 @@ +#include "../../../devices/metax/metax_common.h" +#include "equal_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::equal::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + equalKernel( + c, + a, + b, + ndim, + total_size, + contiguous_strides, + a_strides, + 
b_strides + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_equal( + const EqualInfo &info, + bool * c, + const Tdata * a, + const Tdata * b, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + for(size_t d = 0; d < ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * a_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * b_strides_cuda = a_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + launchKernel<<<1, BLOCK_SIZE, 0, stream>>>( + c, + a, + b, + info.ndim, + total_size, + contiguous_strides_cuda, + a_strides_cuda, + b_strides_cuda + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = a_desc->dtype(); + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3;; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_EQUAL(TDATA) \ + calculate_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + switch (_info.dtype) { + case INFINI_DTYPE_U8: + return CALCULATE_EQUAL(uint8_t); + case INFINI_DTYPE_U16: + return CALCULATE_EQUAL(uint16_t); + case INFINI_DTYPE_U32: + return CALCULATE_EQUAL(uint32_t); + case INFINI_DTYPE_U64: + return CALCULATE_EQUAL(uint64_t); + case INFINI_DTYPE_I8: + return CALCULATE_EQUAL(int8_t); + case INFINI_DTYPE_I16: + return CALCULATE_EQUAL(int16_t); + case INFINI_DTYPE_I32: + return CALCULATE_EQUAL(int32_t); + case INFINI_DTYPE_I64: + return CALCULATE_EQUAL(int64_t); + case INFINI_DTYPE_F16: + return CALCULATE_EQUAL(half); + case INFINI_DTYPE_F32: + return CALCULATE_EQUAL(float); + case INFINI_DTYPE_BF16: + 
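// Both the CPU path and equalKernel walk all elements through the same
// flat-index decomposition: contiguous_strides[d] is the product of the a-dims
// preceding dimension d (so dimension 0 varies fastest in the flattened index),
// and each flat index is peeled into per-dimension coordinates that are then
// re-applied with the real, possibly non-contiguous strides of a and b.
// Host-side reference of that mapping (sketch):
//
//   #include <cstddef>
//   #include <vector>
//
//   ptrdiff_t strided_offset(size_t flat, const std::vector<ptrdiff_t> &contig,
//                            const std::vector<ptrdiff_t> &strides) {
//       ptrdiff_t off = 0;
//       for (int d = (int)contig.size() - 1; d >= 0; --d) {
//           size_t coord = flat / contig[d];
//           flat %= contig[d];
//           off += (ptrdiff_t)coord * strides[d];
//       }
//       return off;
//   }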
return CALCULATE_EQUAL(cuda_bfloat16); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_EQUAL +} +} // namespace op::equal::metax diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu new file mode 100644 index 000000000..d1bfab8f0 --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu @@ -0,0 +1,163 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "equal_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::equal::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + equalKernel( + c, + a, + b, + ndim, + total_size, + contiguous_strides, + a_strides, + b_strides + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_equal( + const EqualInfo &info, + bool * c, + const Tdata * a, + const Tdata * b, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + for(size_t d = 0; d < ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * a_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * b_strides_cuda = a_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<<1, BLOCK_SIZE, 0, stream>>>( + c, + a, + b, + info.ndim, + total_size, + contiguous_strides_cuda, + a_strides_cuda, + b_strides_cuda + ); + + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = a_desc->dtype(); + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3; +// ---------------------- end: check data type and calculate workspace size ----------------------- + *desc_ptr = new Descriptor( + 
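// The ndim * 3 * sizeof(ptrdiff_t) workspace reserved here is partitioned in
// calculate_equal as [contiguous_strides | a_strides | b_strides] and copied to
// the device before the kernel launch.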
dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream_ +) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_EQUAL(TDATA) \ + calculate_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + switch (_info.dtype) { + case INFINI_DTYPE_U8: + return CALCULATE_EQUAL(uint8_t); + case INFINI_DTYPE_U16: + return CALCULATE_EQUAL(uint16_t); + case INFINI_DTYPE_U32: + return CALCULATE_EQUAL(uint32_t); + case INFINI_DTYPE_U64: + return CALCULATE_EQUAL(uint64_t); + case INFINI_DTYPE_I8: + return CALCULATE_EQUAL(int8_t); + case INFINI_DTYPE_I16: + return CALCULATE_EQUAL(int16_t); + case INFINI_DTYPE_I32: + return CALCULATE_EQUAL(int32_t); + case INFINI_DTYPE_I64: + return CALCULATE_EQUAL(int64_t); + case INFINI_DTYPE_F16: + return CALCULATE_EQUAL(half); + case INFINI_DTYPE_F32: + return CALCULATE_EQUAL(float); + case INFINI_DTYPE_BF16: + return CALCULATE_EQUAL(cuda_bfloat16); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_EQUAL +} +} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh new file mode 100644 index 000000000..11760c91d --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __EQUAL_NVIDIA_API_H__ +#define __EQUAL_NVIDIA_API_H__ +#include "../equal.h" + +DESCRIPTOR(nvidia) + +#endif // __EQUAL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc new file mode 100644 index 000000000..81607fef8 --- /dev/null +++ b/src/infiniop/ops/equal/operator.cc @@ -0,0 +1,152 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/equal_metax.h" +#endif + +__C infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + a_desc, \ + b_desc \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, 
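// Semantics of the public API dispatched here: `a` and `b` must share shape and
// dtype, `c` is a one-element BOOL tensor, and after infiniopEqual it holds
// true iff every element of `a` equals the corresponding element of `b`
// (bitwise memcmp on CPU, operator!= on the GPU backends). Call sketch
// (descriptors and device buffers assumed to exist; names illustrative):
//
//   infiniopEqualDescriptor_t eq_desc;
//   infiniopCreateEqualDescriptor(handle, &eq_desc, c_desc, a_desc, b_desc);
//
//   size_t ws = 0;
//   infiniopGetEqualWorkspaceSize(eq_desc, &ws);   // 3 * ndim * sizeof(ptrdiff_t) on GPU, 0 on CPU
//
//   infiniopEqual(eq_desc, workspace, ws, d_equal_flag, d_a, d_b, stream);
//   infiniopDestroyEqualDescriptor(eq_desc);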
nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + c, \ + a, \ + b, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..58a6d0f2d --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,52 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..867c7afa5 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,21 @@ +#ifndef 
__EXP_CPU_H__ +#define __EXP_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::exp(input); + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..12446f31a --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __EXP_CUDA_H__ +#define __EXP_CUDA_H__ + +#include +#include +#include + +namespace op::exp::cuda { +typedef struct ExpOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__expf(f0), __expf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + return __expf(input); + } else if constexpr (std::is_same_v) { + return std::exp(input); + } else { + return std::exp(input); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h new file mode 100644 index 000000000..fb10faf9b --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.h @@ -0,0 +1,8 @@ +#ifndef __EXP_METAX_API_H__ +#define __EXP_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(exp, metax) + +#endif // __EXP_METAX_API_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca new file mode 100644 index 000000000..c71703c6d --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.maca @@ -0,0 +1,60 @@ +#include "exp_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::exp::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return 
_device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..3bdf2eb45 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..ee1dc6768 --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" 
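// Illustrative host-side usage of the exp C API wired up in this file (a sketch only;
// the handle, tensor descriptors, device buffers, workspace and stream are assumed to
// be prepared by the caller, and CHECK_STATUS is assumed to be in scope as it is in
// other files of this patch; the function name is hypothetical):
static inline infiniStatus_t exampleExpCall(infiniopHandle_t handle,
                                            infiniopTensorDescriptor_t y_desc,
                                            infiniopTensorDescriptor_t x_desc,
                                            void *y, const void *x,
                                            void *workspace, size_t workspace_size,
                                            void *stream) {
    infiniopExpDescriptor_t desc;
    CHECK_STATUS(infiniopCreateExpDescriptor(handle, &desc, y_desc, x_desc));
    size_t required = 0;
    CHECK_STATUS(infiniopGetExpWorkspaceSize(desc, &required));
    if (required > workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    CHECK_STATUS(infiniopExp(desc, workspace, required, y, x, stream));
    return infiniopDestroyExpDescriptor(desc);
}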
+#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.cc b/src/infiniop/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 000000000..cc1d5b740 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,106 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::gather::cpu { + +infiniStatus_t calculate_gather( + const GatherInfo &info, + char * output, + const char * input, + const int64_t * index +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + 
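// The loop below builds contiguous strides over the output shape with dimension 0
// fastest (stride[0] = 1, stride[d] = stride[d-1] * shape[d-1]); the flat element
// index of the parallel loop is then decomposed into per-dimension coordinates by
// repeated division from the last dimension down. Illustrative example (not from the
// patch): for an output shape of (2, 3, 4) the strides are (1, 2, 6) and total_size
// is 24; flat index 17 decomposes as 17 / 6 = 2 (d = 2), 5 / 2 = 2 (d = 1),
// 1 / 1 = 1 (d = 0), i.e. coordinates (1, 2, 2), and indeed 1*1 + 2*2 + 2*6 = 17.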
for(size_t d = 0; d < info.ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + int gather_dim = info.dim; + size_t element_size = infiniSizeOf(info.dtype); + + #pragma omp parallel for + for(size_t i = 0; i < total_size; i++) + { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = i; + for (int d = info.ndim - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + if(d != gather_dim) + input_ptr += dim_index * element_size * info.input_strides[d]; + } + int64_t gather_number = *index_ptr; + input_ptr += gather_number * element_size * info.input_strides[gather_dim]; + // *output_ptr = *input_ptr; + memcpy( + output_ptr, + input_ptr, + element_size + ); + } +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + + return calculate_gather(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} + diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.h b/src/infiniop/ops/gather/cpu/gather_cpu.h new file mode 100644 index 000000000..bc74fd669 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,9 @@ +#ifndef __GATHER_CPU_H__ +#define __GATHER_CPU_H__ + +#include "../gather.h" + +DESCRIPTOR(cpu) + + +#endif // __GATHER_CPU_H__ diff --git a/src/infiniop/ops/gather/cuda/kernel.cuh b/src/infiniop/ops/gather/cuda/kernel.cuh new file mode 100644 index 000000000..e49ca0fd2 --- /dev/null +++ b/src/infiniop/ops/gather/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __GATHER_KERNEL_CUH__ +#define __GATHER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void gatherKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for (int d = ndim - 1; d >= 0; d--) { + if (d == gather_dim) + 
continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_gather_size; c ++) { + int64_t gather_number = *(index_ptr + c * index_strides[gather_dim]); + *(output_ptr + c * output_strides[gather_dim]) = \ + *(input_ptr + gather_number * input_strides[gather_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __GATHER_KERNEL_CUH__ diff --git a/src/infiniop/ops/gather/gather.h b/src/infiniop/ops/gather/gather.h new file mode 100644 index 000000000..70991f5db --- /dev/null +++ b/src/infiniop/ops/gather/gather.h @@ -0,0 +1,49 @@ +#ifndef __GATHER_H__ +#define __GATHER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::gather::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + GatherInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + GatherInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/gather/info.h b/src/infiniop/ops/gather/info.h new file mode 100644 index 000000000..aa7a54a16 --- /dev/null +++ b/src/infiniop/ops/gather/info.h @@ -0,0 +1,58 @@ +#ifndef __GATHER_INFO_H__ +#define __GATHER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::gather { + +class GatherInfo { +private: + GatherInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + size_t input_dim_size; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createGatherInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE(output_desc->shape(), index_desc->shape()); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d ++) { + if (d != dim) + CHECK_OR_RETURN(input_desc->dim(d) == output_desc->dim(d), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_OR_RETURN(ndim > dim, INFINI_STATUS_BAD_PARAM); +// -------------------------- end: check tensor shape and input validity 
-------------------------- + return utils::Result(GatherInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->dim(dim), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __GATHER_INFO_H__ diff --git a/src/infiniop/ops/gather/metax/gather_metax.h b/src/infiniop/ops/gather/metax/gather_metax.h new file mode 100644 index 000000000..bf0a25a36 --- /dev/null +++ b/src/infiniop/ops/gather/metax/gather_metax.h @@ -0,0 +1,8 @@ +#ifndef __GATHER_METAX_H__ +#define __GATHER_METAX_H__ + +#include "../gather.h" + +DESCRIPTOR(metax) + +#endif // __GATHER_METAX_H__ diff --git a/src/infiniop/ops/gather/metax/gather_metax.maca b/src/infiniop/ops/gather/metax/gather_metax.maca new file mode 100644 index 000000000..7254078c3 --- /dev/null +++ b/src/infiniop/ops/gather/metax/gather_metax.maca @@ -0,0 +1,190 @@ +#include "../../../devices/metax/metax_common.h" +#include "gather_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::gather::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + gatherKernel( + output, + input, + index, + ndim, + index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == gather_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim + ); + delete[] contiguous_strides; 
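// Two hedged notes on the launch above: if the kernel runs with more than one thread
// per block, gatherKernel's inner loop (`c ++` starting at threadIdx.x) makes the
// threads of a block redundantly re-cover each other's elements; a block-strided form
// such as
//     for (size_t c = threadIdx.x; c < index_gather_size; c += blockDim.x)
// is presumably what the BLOCK_SIZE template parameter is meant to support. Also, if
// hcMemcpyAsync does not stage pageable host buffers before returning the way CUDA's
// cudaMemcpyAsync does, freeing `contiguous_strides` here could race with the pending
// host-to-device copies; synchronizing the stream before the delete[] would be the
// conservative choice.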
+ return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + + #define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_GATHER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_GATHER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_GATHER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_GATHER_WITH_METAX_BLOCK + #undef CALCULATE_GATHER +} +} // namespace op::gather::metax diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cu b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu new file mode 100644 index 000000000..94741a7d5 --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu @@ -0,0 +1,189 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" 
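// Semantics implemented by this operator (matching the CPU path above): output and
// index share a shape, and along the gather dimension
//     output[i0, ..., c, ...] = input[i0, ..., index[i0, ..., c, ...], ...]
// Illustrative example (not taken from the patch): with dim = 1,
//     input = [[10, 11, 12],      index = [[2, 0],
//              [20, 21, 22]]               [1, 1]]
// the output is [[12, 10],
//                [21, 21]].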
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "gather_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::gather::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + gatherKernel( + output, + input, + index, + ndim, + index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == gather_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata><<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); 
+ const GatherInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + #define CALCULATE_GATHER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + #undef CALCULATE_GATHER_WITH_BLOCK_SIZE + #undef CALCULATE_GATHER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gather::nvidia diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh new file mode 100644 index 000000000..46d42fa0c --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __GATHER_NVIDIA_API_H__ +#define __GATHER_NVIDIA_API_H__ +#include "../gather.h" + +DESCRIPTOR(nvidia) + +#endif // __GATHER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/gather/operator.cc b/src/infiniop/ops/gather/operator.cc new file mode 100644 index 000000000..c748f811c --- /dev/null +++ b/src/infiniop/ops/gather/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gather.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gather_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gather_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gather_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim 
+) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gather::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..e7b68508a --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + 
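// Reference points implied by hardswish f(x) = x * clamp(x + 3, 0, 6) / 6, as computed
// by the HardswishOp functor declared in hardswish_cpu.h below (handy as a quick
// sanity check): f(x) = 0 for x <= -3, f(x) = x for x >= 3, and f(1) = 1 * 4 / 6 ≈ 0.6667.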
+ CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..e137be8a0 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,30 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardswishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x = static_cast(input); + double y = x + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x * (y / 6.0); + return static_cast(out); + } + } +} HardswishOp; +} // namespace op::hardswish::cpu + +#endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..d5b369bce --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#include +#include + +namespace op::hardswish::cuda { + +typedef struct HardswishOp { + static constexpr size_t num_inputs = 1; + + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + } + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } + } +} HardswishOp; + +} // namespace op::hardswish::cuda + +#endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 index 000000000..16b131aa9 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_METAX_API_H__ +#define __HARDSWISH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // __HARDSWISH_METAX_API_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..e53b94357 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,60 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::metax diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..9e279c2ef --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..f869ad52f --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // __HARDSWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..e8ba19fc1 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__C infiniStatus_t infiniopCreateHardswishDescriptor( + infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t 
output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHardswish( + infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc new file mode 100644 index 000000000..f45ddef7a --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc @@ -0,0 +1,108 @@ +#include "index_copy_inplace_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../../rearrange/cpu/rearrange_cpu.h" +#include "../info.h" + +namespace op::index_copy_inplace::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = 
reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); +// ---------------------- end: check data type and calculate workspace size ----------------------- + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + size_t size_of_dtype = infiniSizeOf(_info.dtype); + auto index_ptr = reinterpret_cast(index); + + + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + size_of_dtype * _info.total_input_size; + + + reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream); + memset(workspace_out, 0, _info.total_output_size * size_of_dtype); + size_t copy_unit_size = _info.meta_strides[_info.dim] * size_of_dtype; + #pragma omp parallel for + for (size_t dst_index = 0; dst_index < _info.output_shape[_info.dim]; dst_index++) { + size_t src_index = _info.index_shape[0] - 1; + while (true) + { + if (*(index_ptr + src_index * _info.index_strides[0]) == int64_t(dst_index)) { + std::memcpy( + workspace_out + size_of_dtype * dst_index * _info.meta_strides[_info.dim], + workspace_in + size_of_dtype * src_index * _info.meta_strides[_info.dim], + copy_unit_size + ); + break; + } + else if (src_index == 0) + break; + src_index --; + } + } + reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream); + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h new file mode 100644 index 000000000..384197013 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h @@ -0,0 +1,9 @@ +#ifndef __INDEX_COPY_INPLACE_CPU_H__ +#define __INDEX_COPY_INPLACE_CPU_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(cpu) + + +#endif // __INDEX_COPY_INPLACE_CPU_H__ diff --git a/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h new file mode 100644 index 000000000..f99fd3116 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h @@ -0,0 +1,55 @@ +#ifndef __INDEX_COPY_INPLACE_H__ 
+#define __INDEX_COPY_INPLACE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define INDEX_COPY_INPLACE_DESCRIPTOR(NAMESPACE) \ + namespace op::index_copy_inplace::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + IndexCopyInplaceInfo _info; \ + size_t _workspace_size; \ + void *_rearrange_desc_in; \ + void *_rearrange_desc_out; \ + Descriptor( \ + infiniDtype_t dtype, \ + IndexCopyInplaceInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id, \ + void *rearrange_desc_in, \ + void *rearrange_desc_out \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_), \ + _rearrange_desc_in(rearrange_desc_in), \ + _rearrange_desc_out(rearrange_desc_out) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/info.h b/src/infiniop/ops/index_copy_inplace/info.h new file mode 100644 index 000000000..99d6a3a1e --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/info.h @@ -0,0 +1,76 @@ +#ifndef __INDEX_COPY_INPLACE_INFO_H__ +#define __INDEX_COPY_INPLACE_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::index_copy_inplace { + +class IndexCopyInplaceInfo { +private: + IndexCopyInplaceInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t total_input_size; + size_t total_output_size; + std::vector output_shape; + std::vector input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + std::vector meta_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createIndexCopyInplaceInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + std::vector meta_strides(input_desc->ndim()); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + size_t total_input_size = 1; + size_t total_output_size = 1; + for (size_t d = 0; d < input_desc->ndim(); d++){ + total_input_size *= input_desc->dim(d); + total_output_size *= output_desc->dim(d); + if (d == dim) { + continue; + } + else { + meta_strides[d] = last_dim * last_stride; + last_dim = input_desc->dim(d); + last_stride = meta_strides[d]; + } + } + meta_strides[dim] = last_dim * last_stride; +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(IndexCopyInplaceInfo{ +// 
------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + total_input_size, + total_output_size, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + meta_strides, + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __INDEX_COPY_INPLACE_INFO_H__ diff --git a/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h new file mode 100644 index 000000000..65673c88c --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h @@ -0,0 +1,8 @@ +#ifndef __INDEX_COPY_INPLACE_METAX_H__ +#define __INDEX_COPY_INPLACE_METAX_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(metax) + +#endif // __INDEX_COPY_INPLACE_METAX_H__ diff --git a/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca new file mode 100644 index 000000000..e540b6201 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca @@ -0,0 +1,147 @@ +#include "../../../devices/metax/metax_common.h" +#include "index_copy_inplace_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../../rearrange/metax/rearrange_metax.h" +#include "../info.h" + +namespace op::index_copy_inplace::metax { + +infiniStatus_t calculate_index_copy_inplace( + char * output, + const char * input, + const int64_t * index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + hcStream_t stream +) { + int64_t* dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index ++) { + CHECK_METAX(hcMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + hcMemcpyDeviceToHost, + stream + )); + hcStreamSynchronize(stream); + CHECK_METAX(hcMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + hcMemcpyDeviceToDevice, + stream + )); + hcStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + 
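// As in the CPU implementation above, the two rearrange descriptors move data between
// the strided tensors and a shared contiguous "meta" layout in which `dim` is the
// outermost (largest-stride) dimension: input is rearranged into the workspace, whole
// slices along `dim` are then copied to the positions named by `index`
// (e.g. with dim = 0 and index = [2, 0], input slice 0 goes to output slice 2 and
// input slice 1 to output slice 0), and the result is rearranged back into `output`.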
InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::metax::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::metax::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + hcMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + + hcDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * _info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream + )); + hcDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; + +} +} // namespace op::index_copy_inplace::metax diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu new file mode 100644 index 000000000..ebf5907e0 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu @@ -0,0 +1,140 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "index_copy_inplace_nvidia.cuh" +#include "../../rearrange/nvidia/rearrange_nvidia.cuh" +#include "../info.h" + +namespace op::index_copy_inplace::nvidia { + +infiniStatus_t calculate_index_copy_inplace( + char * output, + const char * input, + const int64_t * index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + cudaStream_t stream +) { + int64_t* dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index ++) { + CHECK_CUDA(cudaMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + cudaMemcpyDeviceToHost, + stream + )); + cudaStreamSynchronize(stream); + CHECK_CUDA(cudaMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + cudaMemcpyDeviceToDevice, + stream + )); + cudaStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque 
{ + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + cudaMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + cudaDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * _info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream + )); + cudaDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; +} +} // namespace op::index_copy_inplace::nvidia diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh new file mode 100644 index 000000000..04c3c86f7 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#define __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#include 
"../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(nvidia) + +#endif // __INDEX_COPY_INPLACE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/index_copy_inplace/operator.cc b/src/infiniop/ops/index_copy_inplace/operator.cc new file mode 100644 index 000000000..e8886b65f --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/index_copy_inplace.h" + +#ifdef ENABLE_CPU_API +#include "cpu/index_copy_inplace_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/index_copy_inplace_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/index_copy_inplace_metax.h" +#endif + +__C infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::index_copy_inplace::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopIndexCopyInplace( + infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, 
nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc new file mode 100644 index 000000000..508dcecc6 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc @@ -0,0 +1,284 @@ +#include "interpolate_nearest_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include + +namespace op::interpolate_nearest::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + InterpolateNearestInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const InterpolateNearestInfo &interpolate_info) + : handle(handle_ptr), info(interpolate_info) { + workspace_size = 0; + } + + template + size_t compute_input_index_1d(size_t idx) const { + size_t temp = idx; + + // 1D插值:3D张量 (N, C, W) + size_t w = temp % info.output_size[0]; + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; + } + + // 计算2D插值的输入索引 + template + size_t compute_input_index_2d(size_t idx) const { + size_t temp = idx; + + // 2D插值:4D张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width在索引1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; + } + + // 计算3D插值的输入索引 + template + size_t compute_input_index_3d(size_t idx) const { + size_t temp = idx; + + // 3D插值:5D张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width在索引2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height在索引1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = std::min(static_cast(std::floor(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = 
std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; + } + + // 计算输出索引 + template + size_t compute_output_index(size_t idx) const { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } + } + + // 计算总元素数 + size_t calculate_total_elements() const { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; + } + + // 主要的插值计算函数 + template + void interpolate_nearest_cpu(T *output, const T *input) const { + size_t total_elements = calculate_total_elements(); + +#pragma omp parallel for schedule(static) + for (ptrdiff_t idx = 0; idx < static_cast(total_elements); ++idx) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx); + break; + default: + continue; + } + + size_t output_idx = compute_output_index(idx); + output[output_idx] = input[input_idx]; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + const InterpolateNearestInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16 && data_type != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = 
static_cast<float *>(output);
+            const float *typed_input = static_cast<const float *>(input);
+            interpolate_nearest_cpu(typed_output, typed_input);
+            break;
+        }
+
+        case INFINI_DTYPE_F16: {
+            fp16_t *typed_output = static_cast<fp16_t *>(output);
+            const fp16_t *typed_input = static_cast<const fp16_t *>(input);
+            interpolate_nearest_cpu(typed_output, typed_input);
+            break;
+        }
+
+        case INFINI_DTYPE_BF16: {
+            bf16_t *typed_output = static_cast<bf16_t *>(output);
+            const bf16_t *typed_input = static_cast<const bf16_t *>(input);
+            interpolate_nearest_cpu(typed_output, typed_input);
+            break;
+        }
+
+        case INFINI_DTYPE_I8: {
+            int8_t *typed_output = static_cast<int8_t *>(output);
+            const int8_t *typed_input = static_cast<const int8_t *>(input);
+            interpolate_nearest_cpu(typed_output, typed_input);
+            break;
+        }
+
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        return INFINI_STATUS_SUCCESS;
+    }
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+    }
+}
+
+infiniStatus_t Descriptor::create(infiniopHandle_t handle_,
+                                  Descriptor **desc_ptr,
+                                  infiniopTensorDescriptor_t output_desc,
+                                  infiniopTensorDescriptor_t input_desc) {
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = output_desc->dtype();
+
+    // Check that the data type is supported
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_I8);
+
+    InterpolateNearestInfo info;
+    CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc));
+
+    auto opaque_result = Opaque::create(handle, info, dtype);
+    CHECK_RESULT(opaque_result);
+    auto opaque = new Opaque(opaque_result.take());
+
+    *desc_ptr = new Descriptor(dtype, info, opaque->workspace_size, opaque,
+                               handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *output, const void *input,
+                                     void *stream) const {
+    return _opaque->calculate(workspace, workspace_size, output, input, _dtype);
+}
+
+} // namespace op::interpolate_nearest::cpu
diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h
new file mode 100644
index 000000000..78dd3ff97
--- /dev/null
+++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __INTERPOLATE_NEAREST_CPU_H__
+#define __INTERPOLATE_NEAREST_CPU_H__
+
+#include "../interpolate_nearest.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __INTERPOLATE_NEAREST_CPU_H__
diff --git a/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh
new file mode 100644
index 000000000..60c798792
--- /dev/null
+++ b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh
@@ -0,0 +1,168 @@
+#ifndef INTERPOLATE_NEAREST_KERNEL_CUH
+#define INTERPOLATE_NEAREST_KERNEL_CUH
+
+#include "../info.h"
+#include
+
+template <typename T>
+__device__ inline size_t
+compute_input_index_1d(size_t idx, const InterpolateNearestInfo &info) {
+    size_t temp = idx;
+
+    // 1D interpolation: 3D tensor (N, C, W)
+    size_t w = temp % info.output_size[0]; // width at index 0
+    temp /= info.output_size[0];
+    size_t c = temp % info.channels;
+    size_t b = temp / info.channels;
+
+    float inv_scale = static_cast<float>(info.input_size[0]) / static_cast<float>(info.output_size[0]);
+    size_t input_w = min(static_cast<size_t>(floorf(static_cast<float>(w) * inv_scale)),
+                         info.input_size[0] - 1);
+
+    return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2];
+}
+
+template <typename T>
+__device__ inline size_t
+compute_input_index_2d(size_t idx, const InterpolateNearestInfo &info) {
+    size_t temp = idx;
+
+    // 2D interpolation: 4D tensor (N, C, H, W)
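+    // The decomposition below treats the flat output index as (b, c, h, w)
+    // with w varying fastest, then maps each output coordinate to an input
+    // coordinate via floor(out_coord * in_size / out_size), clamped to
+    // in_size - 1 (nearest-neighbor, "floor" rounding). For example, with an
+    // input width of 2 and an output width of 5, inv_scale_w = 0.4 and output
+    // w = 0..4 reads input w = 0, 0, 0, 1, 1.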
+    size_t w = temp % info.output_size[1]; // width at index 1
+    temp /= info.output_size[1];
+    size_t h = temp % info.output_size[0]; // height at index 0
+    temp /= info.output_size[0];
+    size_t c = temp % info.channels;
+    size_t b = temp / info.channels;
+
+    float inv_scale_h = static_cast<float>(info.input_size[0]) / static_cast<float>(info.output_size[0]);
+    float inv_scale_w = static_cast<float>(info.input_size[1]) / static_cast<float>(info.output_size[1]);
+
+    size_t input_h = min(static_cast<size_t>(floorf(static_cast<float>(h) * inv_scale_h)),
+                         info.input_size[0] - 1);
+    size_t input_w = min(static_cast<size_t>(floorf(static_cast<float>(w) * inv_scale_w)),
+                         info.input_size[1] - 1);
+
+    return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3];
+}
+
+template <typename T>
+__device__ inline size_t
+compute_input_index_3d(size_t idx, const InterpolateNearestInfo &info) {
+    size_t temp = idx;
+
+    // 3D interpolation: 5D tensor (N, C, D, H, W)
+    size_t w = temp % info.output_size[2]; // width at index 2
+    temp /= info.output_size[2];
+    size_t h = temp % info.output_size[1]; // height at index 1
+    temp /= info.output_size[1];
+    size_t d = temp % info.output_size[0]; // depth at index 0
+    temp /= info.output_size[0];
+    size_t c = temp % info.channels;
+    size_t b = temp / info.channels;
+
+    float inv_scale_d = static_cast<float>(info.input_size[0]) / static_cast<float>(info.output_size[0]);
+    float inv_scale_h = static_cast<float>(info.input_size[1]) / static_cast<float>(info.output_size[1]);
+    float inv_scale_w = static_cast<float>(info.input_size[2]) / static_cast<float>(info.output_size[2]);
+
+    size_t input_d = min(static_cast<size_t>(floorf(static_cast<float>(d) * inv_scale_d)),
+                         info.input_size[0] - 1);
+    size_t input_h = min(static_cast<size_t>(floorf(static_cast<float>(h) * inv_scale_h)),
+                         info.input_size[1] - 1);
+    size_t input_w = min(static_cast<size_t>(floorf(static_cast<float>(w) * inv_scale_w)),
+                         info.input_size[2] - 1);
+
+    return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4];
+}
+
+template <typename T>
+__device__ inline size_t
+compute_output_index(size_t idx, const InterpolateNearestInfo &info) {
+    size_t temp = idx;
+    size_t w, h, d, c, b;
+
+    switch (info.dim) {
+    case INTERPOLATE_1D: {
+        // 3D tensor (N, C, W)
+        w = temp % info.output_size[0];
+        temp /= info.output_size[0];
+        c = temp % info.channels;
+        b = temp / info.channels;
+        return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2];
+    }
+
+    case INTERPOLATE_2D: {
+        // 4D tensor (N, C, H, W)
+        w = temp % info.output_size[1];
+        temp /= info.output_size[1];
+        h = temp % info.output_size[0];
+        temp /= info.output_size[0];
+        c = temp % info.channels;
+        b = temp / info.channels;
+        return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3];
+    }
+
+    case INTERPOLATE_3D: {
+        // 5D tensor (N, C, D, H, W)
+        w = temp % info.output_size[2];
+        temp /= info.output_size[2];
+        h = temp % info.output_size[1];
+        temp /= info.output_size[1];
+        d = temp % info.output_size[0];
+        temp /= info.output_size[0];
+        c = temp % info.channels;
+        b = temp / info.channels;
+        return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4];
+    }
+
+    default:
+        return 0;
+    }
+}
+
+__host__ __device__ inline size_t
+calculate_total_elements(const InterpolateNearestInfo &info) {
+    size_t total = info.batch_size * info.channels;
+    switch (info.dim) {
+    case INTERPOLATE_1D:
+        total *= info.output_size[0]; // width
+        break;
+    case 
INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; +} + +template +__global__ void interpolate_nearest_kernel(T *output, const T *input, + InterpolateNearestInfo info) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total_elements = calculate_total_elements(info); + + if (idx < total_elements) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx, info); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx, info); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx, info); + break; + default: + return; + } + + size_t output_idx = compute_output_index(idx, info); + output[output_idx] = input[input_idx]; + } +} + +#endif // INTERPOLATE_NEAREST_KERNEL_CUH diff --git a/src/infiniop/ops/interpolate_nearest/info.h b/src/infiniop/ops/interpolate_nearest/info.h new file mode 100644 index 000000000..162d6eb02 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/info.h @@ -0,0 +1,118 @@ +#ifndef __INTERPOLATE_NEAREST_INFO_H__ +#define __INTERPOLATE_NEAREST_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +enum InterpolateDim { + INTERPOLATE_1D = 1, // 3D 张量 (N, C, W) + INTERPOLATE_2D = 2, // 4D 张量 (N, C, H, W) + INTERPOLATE_3D = 3 // 5D 张量 (N, C, D, H, W) +}; + +struct InterpolateNearestInfo { + size_t batch_size; + size_t channels; + + // 输入和输出的空间维度大小 + size_t input_size[3]; // [depth/height/width] 根据维度使用不同数量 + size_t output_size[3]; // [depth/height/width] 根据维度使用不同数量 + + InterpolateDim dim; // 插值维度:1D, 2D, 3D + infiniDtype_t dtype; + + // 张量步长(最多支持 5D 张量) + size_t input_stride[5]; + size_t output_stride[5]; + + static infiniStatus_t create( + InterpolateNearestInfo *info, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + // 检查数据类型 + if (input_desc->dtype() != output_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto input_shape = input_desc->shape(); + auto output_shape = output_desc->shape(); + auto input_stride = input_desc->strides(); + auto output_stride = output_desc->strides(); + + // 根据张量维度确定插值类型 + if (input_desc->ndim() == 3 && output_desc->ndim() == 3) { + // 1D 插值:3D 张量 (N, C, W) + info->dim = INTERPOLATE_1D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // width + info->output_size[0] = output_shape[2]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 3; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 4 && output_desc->ndim() == 4) { + // 2D 插值:4D 张量 (N, C, H, W) + info->dim = INTERPOLATE_2D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // height + info->input_size[1] = input_shape[3]; // width + info->output_size[0] = output_shape[2]; // height + info->output_size[1] = output_shape[3]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 4; ++i) { + info->input_stride[i] = 
input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 5 && output_desc->ndim() == 5) { + // 3D 插值:5D 张量 (N, C, D, H, W) + info->dim = INTERPOLATE_3D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // depth + info->input_size[1] = input_shape[3]; // height + info->input_size[2] = input_shape[4]; // width + info->output_size[0] = output_shape[2]; // depth + info->output_size[1] = output_shape[3]; // height + info->output_size[2] = output_shape[4]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 5; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info->dtype = input_desc->dtype(); + return INFINI_STATUS_SUCCESS; + } +}; + +#endif // __INTERPOLATE_NEAREST_INFO_H__ diff --git a/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h new file mode 100644 index 000000000..73499c2ff --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h @@ -0,0 +1,51 @@ +#ifndef __INTERPOLATE_NEAREST_H__ +#define __INTERPOLATE_NEAREST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::interpolate_nearest::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + InterpolateNearestInfo _info; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + InterpolateNearestInfo info, \ + size_t workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _dtype(dtype), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __INTERPOLATE_NEAREST_H__ diff --git a/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h new file mode 100644 index 000000000..1619dbf2f --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h @@ -0,0 +1,8 @@ +#ifndef __INTERPOLATE_NEAREST_METAX_H__ +#define __INTERPOLATE_NEAREST_METAX_H__ + +#include "../interpolate_nearest.h" + +DESCRIPTOR(metax) + +#endif // __INTERPOLATE_NEAREST_METAX_H__ diff --git a/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca new file mode 100644 index 000000000..5cf0e5e66 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca @@ -0,0 +1,86 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "interpolate_nearest_metax.h" +#include +#include + +#include "../cuda/kernel.cuh" + +namespace 
op::interpolate_nearest::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + Opaque(std::shared_ptr internal_) : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && + dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +inline void launch_interpolate_nearest_kernel(T *output, const T *input, InterpolateNearestInfo info, int grid_size, int block_size, hcStream_t stream) { + interpolate_nearest_kernel<<>>(output, input, info); +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + int total_elements = calculate_total_elements(_info); + int block_size = 256; + int grid_size = (total_elements + block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_F16: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__hpcc_bfloat16 *>(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel<__hpcc_bfloat16>(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::metax diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu new file mode 100644 index 000000000..a7b63c6f4 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu @@ -0,0 +1,93 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "interpolate_nearest_nvidia.cuh" +#include +#include +#include + +namespace op::interpolate_nearest::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle 
= reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // Check supported data types + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + size_t total_elements = calculate_total_elements(_info); + + int block_size = 256; + int grid_size = (total_elements + block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = reinterpret_cast(output); + const float *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_F16: { + half *typed_output = reinterpret_cast(output); + const half *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<<>>( + typed_output, typed_input, _info); + } break; + + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__nv_bfloat16 *>(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<__nv_bfloat16> + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::nvidia diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh new file mode 100644 index 000000000..aab5f7882 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh @@ -0,0 +1,9 @@ +#ifndef __INTERPOLATE_NEAREST_NVIDIA_CUH__ +#define __INTERPOLATE_NEAREST_NVIDIA_CUH__ + +#include "../../../devices/nvidia/nvidia_handle.h" +#include "../interpolate_nearest.h" + +DESCRIPTOR(nvidia) + +#endif // __INTERPOLATE_NEAREST_NVIDIA_CUH__ diff --git a/src/infiniop/ops/interpolate_nearest/operator.cc b/src/infiniop/ops/interpolate_nearest/operator.cc new file mode 100644 index 000000000..0a0f99ee1 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/interpolate_nearest.h" + +#ifdef ENABLE_CPU_API +#include "cpu/interpolate_nearest_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/interpolate_nearest_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/interpolate_nearest_metax.h" +#endif + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor( + infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::interpolate_nearest::NAMESPACE::Descriptor::create( \ + 
handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize( + infiniopInterpolateNearestDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopInterpolateNearest( + infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor( + infiniopInterpolateNearestDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.cc b/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.cc new file mode 100644 index 000000000..8833cb999 --- /dev/null +++ b/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.cc @@ -0,0 +1,126 @@ +#include "layer_norm_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::layer_norm::cpu { + +template +infiniStatus_t calculate_layer_norm( + const LayerNormInfo &info, + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + #pragma omp parallel for + for(size_t b = 0; b < info.input_shape[0] * info.input_shape[1]; b ++) + { + size_t b0 = b / info.input_shape[1], b1 = b % info.input_shape[1]; + auto output_ptr = output + b0 * info.output_strides[0] + b1 * 
info.output_strides[1]; + auto input_ptr = input + b0 * info.input_strides[0] + b1 * info.input_strides[1]; + auto standard_ptr = input_standardization + b0 * info.input_standardization_strides[0] + b1 * info.input_standardization_strides[1]; + auto std_ptr = input_std_deviation + b0 * info.input_std_deviation_strides[0] + b1 * info.input_std_deviation_strides[1]; + float mean = op::common_cpu::reduce_op::sum( + input_ptr, + info.normalized_size, + info.input_strides[2] + ) / info.input_shape[2]; + float sum_sq = op::common_cpu::reduce_op::sumSquared( + input_ptr, + info.normalized_size, + info.input_strides[2] + ); + float var = sum_sq / (info.normalized_size) - mean * mean; + float std_deviation = std::sqrt(var + info.eps); + *std_ptr = utils::cast(std_deviation); + + for(size_t d = 0; d < info.normalized_size; d ++) + { + float x_standard = (utils::cast(*(input_ptr + d * info.input_strides[2])) - mean) / std_deviation; + *(standard_ptr + d * info.input_standardization_strides[2]) = utils::cast(x_standard); + *(output_ptr + d * info.output_strides[2]) = utils::cast( + x_standard * utils::cast(*(weight + d * info.weight_strides[0])) + \ + (info.bias_exist ? utils::cast(*(bias + d * info.bias_strides[0])) : float(0)) + ); + } + + + +} +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = LayerNormInfo::createLayerNormInfo( + output_desc, + input_standardization_desc, + input_std_deviation_desc, + input_desc, + weight_desc, + bias_desc, + eps + ); + CHECK_RESULT(result); + const LayerNormInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +#define CALCULATE_LAYER_NORM(TDATA) \ + CHECK_STATUS(calculate_layer_norm(_info, \ +(TDATA *)output, (TDATA *)input_standardization, (TDATA *)input_std_deviation, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias)) + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const void * input, + const void * weight, + const void * bias, + void *stream +) const { + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_LAYER_NORM(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_LAYER_NORM(bf16_t); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_LAYER_NORM(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.h b/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.h new file 
mode 100644 index 000000000..710373a4b --- /dev/null +++ b/src/infiniop/ops/layer_norm/cpu/layer_norm_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LAYER_NORM_CPU_H__ +#define __LAYER_NORM_CPU_H__ + +#include "../layer_norm.h" + +DESCRIPTOR(cpu) + + +#endif // __LAYER_NORM_CPU_H__ diff --git a/src/infiniop/ops/layer_norm/cuda/kernel.cuh b/src/infiniop/ops/layer_norm/cuda/kernel.cuh new file mode 100644 index 000000000..6a2ba964a --- /dev/null +++ b/src/infiniop/ops/layer_norm/cuda/kernel.cuh @@ -0,0 +1,49 @@ +#ifndef __LAYER_NORM_KERNEL_CUH__ +#define __LAYER_NORM_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void layerNormKernel( + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + float eps, + size_t normalized_size, + const ptrdiff_t* output_strides, + const ptrdiff_t* input_standardization_strides, + const ptrdiff_t* input_std_deviation_strides, + const ptrdiff_t* input_strides, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + bool bias_exist +) { + size_t b0 = blockIdx.x, b1 = blockIdx.y; + + auto output_ptr = output + b0 * output_strides[0] + b1 * output_strides[1]; + auto input_ptr = input + b0 * input_strides[0] + b1 * input_strides[1]; + auto standard_ptr = input_standardization + b0 * input_standardization_strides[0] + b1 * input_standardization_strides[1]; + auto std_ptr = input_std_deviation + b0 * input_std_deviation_strides[0] + b1 * input_std_deviation_strides[1]; + Tcompute mean = op::common_cuda::reduce_op::sum( + input_ptr, + normalized_size + ) / normalized_size; + Tcompute sum_squared = op::common_cuda::reduce_op::sumSquared( + input_ptr, + normalized_size + ); + + Tcompute var = sum_squared / normalized_size - mean * mean; + Tcompute std_deviation = sqrtf(var + Tcompute(eps)); + *std_ptr = std_deviation; + + for (size_t d = 0; d < normalized_size; d ++) { + Tcompute x_standard = (Tcompute(input_ptr[d]) - mean) / std_deviation; + standard_ptr[d] = x_standard; + output_ptr[d] = x_standard * Tcompute(*(weight + d * weight_stride)) + (bias_exist ? 
Tcompute(*(bias + d * bias_stride)) : Tcompute(0)); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __LAYER_NORM_KERNEL_CUH__ diff --git a/src/infiniop/ops/layer_norm/info.h b/src/infiniop/ops/layer_norm/info.h new file mode 100644 index 000000000..9b6c01692 --- /dev/null +++ b/src/infiniop/ops/layer_norm/info.h @@ -0,0 +1,84 @@ +#ifndef __LAYER_NORM_INFO_H__ +#define __LAYER_NORM_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::layer_norm { + +class LayerNormInfo { +private: + LayerNormInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector input_shape; + size_t normalized_size; + std::vector output_strides; + std::vector input_standardization_strides; + std::vector input_std_deviation_strides; + std::vector input_strides; + std::vector weight_strides; + std::vector bias_strides; + float eps; + bool bias_exist; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createLayerNormInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE( + output_desc->shape(), input_desc->shape(), input_standardization_desc->shape() + ); + size_t batch_size = input_desc->dim(0), + channel_size = input_desc->dim(1), + feature_size = input_desc->dim(2); + + bool bias_exist = bias_desc != nullptr; + CHECK_OR_RETURN( + (!bias_exist) || (bias_desc->ndim() == 1 && bias_desc->dim(0) == feature_size), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + CHECK_OR_RETURN( + (weight_desc->ndim() == 1) && (weight_desc->dim(0) == feature_size), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + CHECK_OR_RETURN( + input_std_deviation_desc->ndim() == 2 && \ + input_std_deviation_desc->dim(0) == batch_size && \ + input_std_deviation_desc->dim(1) == channel_size, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(LayerNormInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + input_desc->ndim(), + input_desc->shape(), + input_desc->dim(input_desc->ndim() - 1), + output_desc->strides(), + input_standardization_desc->strides(), + input_std_deviation_desc->strides(), + input_desc->strides(), + weight_desc->strides(), + bias_exist ? 
bias_desc->strides() : std::vector(), + eps, + bias_exist +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __LAYER_NORM_INFO_H__ diff --git a/src/infiniop/ops/layer_norm/layer_norm.h b/src/infiniop/ops/layer_norm/layer_norm.h new file mode 100644 index 000000000..8b847a80d --- /dev/null +++ b/src/infiniop/ops/layer_norm/layer_norm.h @@ -0,0 +1,55 @@ +#ifndef __LAYER_NORM_H__ +#define __LAYER_NORM_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::layer_norm::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + LayerNormInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + LayerNormInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_standardization_desc, \ + infiniopTensorDescriptor_t input_std_deviation_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + float eps \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + void * input_standardization, \ + void * input_std_deviation, \ + const void * input, \ + const void * weight, \ + const void * bias, \ + void *stream \ + ) const; \ + }; \ + } + +#endif diff --git a/src/infiniop/ops/layer_norm/metax/layer_norm_metax.h b/src/infiniop/ops/layer_norm/metax/layer_norm_metax.h new file mode 100644 index 000000000..6634031a3 --- /dev/null +++ b/src/infiniop/ops/layer_norm/metax/layer_norm_metax.h @@ -0,0 +1,8 @@ +#ifndef __LAYER_NORM_METAX_H__ +#define __LAYER_NORM_METAX_H__ + +#include "../layer_norm.h" + +DESCRIPTOR(metax) + +#endif // __LAYER_NORM_METAX_H__ diff --git a/src/infiniop/ops/layer_norm/metax/layer_norm_metax.maca b/src/infiniop/ops/layer_norm/metax/layer_norm_metax.maca new file mode 100644 index 000000000..ee6080999 --- /dev/null +++ b/src/infiniop/ops/layer_norm/metax/layer_norm_metax.maca @@ -0,0 +1,182 @@ +#include "../../../devices/metax/metax_common.h" +#include "layer_norm_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::layer_norm::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + float eps, + size_t normalized_size, + const ptrdiff_t* output_strides, + const ptrdiff_t* input_standardization_strides, + const ptrdiff_t* input_std_deviation_strides, + const ptrdiff_t* input_strides, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + bool bias_exist +) { + layerNormKernel( + output, + input_standardization, + input_std_deviation, + input, + weight, + bias, + eps, + normalized_size, + output_strides, + input_standardization_strides, + input_std_deviation_strides, + 
input_strides, + weight_stride, + bias_stride, + bias_exist + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_layer_norm( + const LayerNormInfo &info, + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + hcStream_t stream, + void *workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * input_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * input_standardization_strides_cuda = output_strides_cuda + ndim; + ptrdiff_t * input_std_deviation_strides_cuda = input_standardization_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_standardization_strides_cuda, info.input_standardization_strides.data(), sizeof(ptrdiff_t) * (ndim - 1), hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_std_deviation_strides_cuda, info.input_std_deviation_strides.data(), sizeof(ptrdiff_t) * (ndim - 1), hcMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata, float><<>>( + output, + input_standardization, + input_std_deviation, + input, + weight, + bias, + info.eps, + info.normalized_size, + output_strides_cuda, + input_standardization_strides_cuda, + input_std_deviation_strides_cuda, + input_strides_cuda, + info.weight_strides[0], + info.bias_exist ? info.bias_strides[0] : 0, + info.bias_exist + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = LayerNormInfo::createLayerNormInfo( + output_desc, + input_standardization_desc, + input_std_deviation_desc, + input_desc, + weight_desc, + bias_desc, + eps + ); + CHECK_RESULT(result); + const LayerNormInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const void * input, + const void * weight, + const void * bias, + void *stream_ +) const { + if (workspace_size < _workspace_size) + 
return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_LAYER_NORM(BLOCK_SIZE, TDATA) \ + calculate_layer_norm(_info, (TDATA *)output, (TDATA *)input_standardization, (TDATA *)input_std_deviation, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias, stream, workspace) + #define CALCULATE_LAYER_NORM_WITH_MATEX_BLOCK(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, cuda_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_LAYER_NORM_WITH_MATEX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_LAYER_NORM_WITH_MATEX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LAYER_NORM_WITH_MATEX_BLOCK + #undef CALCULATE_LAYER_NORM +} +} // namespace op::layer_norm::metax diff --git a/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu b/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu new file mode 100644 index 000000000..5d512423d --- /dev/null +++ b/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cu @@ -0,0 +1,184 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "layer_norm_nvidia.cuh" +#include "../info.h" + +namespace op::layer_norm::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + float eps, + size_t normalized_size, + const ptrdiff_t* output_strides, + const ptrdiff_t* input_standardization_strides, + const ptrdiff_t* input_std_deviation_strides, + const ptrdiff_t* input_strides, + ptrdiff_t weight_stride, + ptrdiff_t bias_stride, + bool bias_exist +) { + layerNormKernel( + output, + input_standardization, + input_std_deviation, + input, + weight, + bias, + eps, + normalized_size, + output_strides, + input_standardization_strides, + input_std_deviation_strides, + input_strides, + weight_stride, + bias_stride, + bias_exist + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_layer_norm( + const LayerNormInfo &info, + Tdata * output, + Tdata * input_standardization, + Tdata * input_std_deviation, + const Tdata * input, + const Tdata * weight, + const Tdata * bias, + cudaStream_t stream, + void *workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * input_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * input_standardization_strides_cuda = output_strides_cuda + ndim; + ptrdiff_t * input_std_deviation_strides_cuda = input_standardization_strides_cuda + ndim; + + 
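// Aside (not part of the patch): the pointer arithmetic above carves the caller-provided
// workspace into four back-to-back ptrdiff_t arrays -- input, output, input_standardization
// and input_std_deviation strides -- so this layout spans 4 * ndim ptrdiff_t values. A hedged
// host-side sanity check, using a hypothetical `required` local, could look like:
//
//   const size_t required = 4 * info.ndim * sizeof(ptrdiff_t);
//   if (workspace_size < required) {
//       return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
//   }
//
// The async copies below stage the host-side stride vectors into that device buffer on the
// same stream that later runs the kernel, so stream ordering alone guarantees the strides are
// resident before the launch.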
CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_standardization_strides_cuda, info.input_standardization_strides.data(), sizeof(ptrdiff_t) * (ndim - 1), cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_std_deviation_strides_cuda, info.input_std_deviation_strides.data(), sizeof(ptrdiff_t) * (ndim - 1), cudaMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata, float><<>>( + output, + input_standardization, + input_std_deviation, + input, + weight, + bias, + info.eps, + info.normalized_size, + output_strides_cuda, + input_standardization_strides_cuda, + input_std_deviation_strides_cuda, + input_strides_cuda, + info.weight_strides[0], + info.bias_exist ? info.bias_strides[0] : 0, + info.bias_exist + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = output_desc->ndim() * sizeof(size_t) * 5; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = LayerNormInfo::createLayerNormInfo( + output_desc, + input_standardization_desc, + input_std_deviation_desc, + input_desc, + weight_desc, + bias_desc, + eps + ); + CHECK_RESULT(result); + const LayerNormInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const void * input, + const void * weight, + const void * bias, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_LAYER_NORM(BLOCK_SIZE, TDATA) \ + calculate_layer_norm(_info, (TDATA *)output, (TDATA *)input_standardization, (TDATA *)input_std_deviation, (const TDATA *)input, (const TDATA *)weight, (const TDATA *)bias, stream, workspace) + #define CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LAYER_NORM(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + 
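// Aside (not part of the patch): the two macros above only pair a compile-time block size with
// a concrete element type. On a device reporting 1024 threads per block and an F16 descriptor,
// CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) presumably expands to roughly:
//
//   return calculate_layer_norm<CUDA_BLOCK_SIZE_1024, half>(
//       _info, (half *)output, (half *)input_standardization, (half *)input_std_deviation,
//       (const half *)input, (const half *)weight, (const half *)bias, stream, workspace);
//
// so the dispatch below picks exactly one fully-typed instantiation per (block size, dtype)
// pair; unsupported block sizes fall through to DEVICE_ARCHITECTURE_NOT_SUPPORTED.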
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::layer_norm::nvidia diff --git a/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cuh b/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cuh new file mode 100644 index 000000000..bb770d19a --- /dev/null +++ b/src/infiniop/ops/layer_norm/nvidia/layer_norm_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __LAYER_NORM_NVIDIA_API_H__ +#define __LAYER_NORM_NVIDIA_API_H__ +#include "../layer_norm.h" + +DESCRIPTOR(nvidia) + +#endif // __LAYER_NORM_NVIDIA_API_H__ diff --git a/src/infiniop/ops/layer_norm/operator.cc b/src/infiniop/ops/layer_norm/operator.cc new file mode 100644 index 000000000..846c9ce7f --- /dev/null +++ b/src/infiniop/ops/layer_norm/operator.cc @@ -0,0 +1,166 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/layer_norm.h" + +#ifdef ENABLE_CPU_API +#include "cpu/layer_norm_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/layer_norm_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/layer_norm_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLayerNormDescriptor( + infiniopHandle_t handle, + infiniopLayerNormDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_standardization_desc, + infiniopTensorDescriptor_t input_std_deviation_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + float eps +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::layer_norm::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_standardization_desc, \ + input_std_deviation_desc, \ + input_desc, \ + weight_desc, \ + bias_desc, \ + eps \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLayerNormWorkspaceSize(infiniopLayerNormDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLayerNorm( + infiniopLayerNormDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + void * input_standardization, + void * input_std_deviation, + const 
void * input, + const void * weight, + const void * bias, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input_standardization, \ + input_std_deviation, \ + input, \ + weight, \ + bias, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc new file mode 100644 index 000000000..c81f0a539 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc @@ -0,0 +1,66 @@ +#include "logical_and_cpu.h" +#include "infinicore.h" + +namespace op::logical_and::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return 
_device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::cpu diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h new file mode 100644 index 000000000..701960bd5 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h @@ -0,0 +1,29 @@ +#ifndef __LOGICAL_AND_CPU_H__ +#define __LOGICAL_AND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, cpu) + +namespace op::logical_and::cpu { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a && f_b; + } else { + return a && b; + } + } +} LogicalAndOp; +} // namespace op::logical_and::cpu + +#endif // __LOGICAL_AND_CPU_H__ diff --git a/src/infiniop/ops/logical_and/cuda/kernel.cuh b/src/infiniop/ops/logical_and/cuda/kernel.cuh new file mode 100644 index 000000000..0b763d951 --- /dev/null +++ b/src/infiniop/ops/logical_and/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_AND_CUDA_H__ +#define __LOGICAL_AND_CUDA_H__ + +namespace op::logical_and::cuda { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a && b; + } +} LogicalAndOp; +} // namespace op::logical_and::cuda + +#endif // __LOGICAL_AND_CUDA_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.h b/src/infiniop/ops/logical_and/metax/logical_and_metax.h new file mode 100644 index 000000000..696697322 --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_METAX_API_H__ +#define __LOGICAL_AND_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, metax) + +#endif // __LOGICAL_AND_METAX_API_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.maca b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca new file mode 100644 index 000000000..68e3f0ddb --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca @@ -0,0 +1,73 @@ +#include "logical_and_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::logical_and::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX 
elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::metax diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu new file mode 100644 index 000000000..7f0680a57 --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_and_nvidia.cuh" + +namespace op::logical_and::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return 
_device_info->calculate<256, cuda::LogicalAndOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::nvidia diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh new file mode 100644 index 000000000..9d68754bf --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_CUDA_API_H__ +#define __LOGICAL_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_and, nvidia) + +#endif // __LOGICAL_AND_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_and/operator.cc b/src/infiniop/ops/logical_and/operator.cc new file mode 100644 index 000000000..ccc168a90 --- /dev/null +++ b/src/infiniop/ops/logical_and/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_and.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_and_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/logical_and_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_and_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalAndDescriptor( + infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_and::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = 
reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalAnd( + infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..1324c98f1 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,66 @@ +#include "logical_or_cpu.h" +#include "infinicore.h" + +namespace op::logical_or::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, 
inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..7c26c4d37 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,29 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, cpu) + +namespace op::logical_or::cpu { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a || f_b; + } else { + return a || b; + } + } +} LogicalOrOp; +} // namespace op::logical_or::cpu + +#endif // __LOGICAL_OR_CPU_H__ diff --git a/src/infiniop/ops/logical_or/cuda/kernel.cuh b/src/infiniop/ops/logical_or/cuda/kernel.cuh new file mode 100644 index 000000000..3c705428e --- /dev/null +++ b/src/infiniop/ops/logical_or/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_OR_CUDA_H__ +#define __LOGICAL_OR_CUDA_H__ + +namespace op::logical_or::cuda { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a || b; + } +} LogicalOrOp; +} // namespace op::logical_or::cuda + +#endif // __LOGICAL_OR_CUDA_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.h b/src/infiniop/ops/logical_or/metax/logical_or_metax.h new file mode 100644 index 000000000..e530d9ed5 --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_METAX_API_H__ +#define __LOGICAL_OR_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, metax) + +#endif // __LOGICAL_OR_METAX_API_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.maca b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca new file mode 100644 index 000000000..17d1f8ed0 --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca @@ -0,0 +1,73 @@ +#include "logical_or_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::logical_or::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + 
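// Aside (not part of the patch): only a_desc's dtype is inspected below; b is assumed to share
// it, and the LogicalOrOp functor rejects mismatched element types at run time. A hedged usage
// sketch of the public C API wired up in operator.cc (handle, descriptors and device buffers
// assumed to already exist):
//
//   infiniopLogicalOrDescriptor_t desc;
//   infiniopCreateLogicalOrDescriptor(handle, &desc, c_desc, a_desc, b_desc);
//   size_t ws_size = 0;
//   infiniopGetLogicalOrWorkspaceSize(desc, &ws_size);
//   // ... allocate ws_size bytes of device memory as `workspace` ...
//   infiniopLogicalOr(desc, workspace, ws_size, c, a, b, stream);
//   infiniopDestroyLogicalOrDescriptor(desc);
//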
const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::metax diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu new file mode 100644 index 000000000..151079f07 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_or_nvidia.cuh" + +namespace op::logical_or::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise 
descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::nvidia diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh new file mode 100644 index 000000000..a70bd8da7 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_CUDA_API_H__ +#define __LOGICAL_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_or, nvidia) + +#endif // __LOGICAL_OR_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..463d7fc1b --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_or.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/logical_or_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_or_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalOrDescriptor( + infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_or::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalOr( + infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc new file mode 100644 index 000000000..5c729e7e8 --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc @@ -0,0 +1,322 @@ +#include "maxpool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::maxpool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + MaxPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const MaxPoolInfo &maxpool_info) + : handle(handle_ptr), info(maxpool_info) { + // CPU实现通常不需要额外的工作空间 + workspace_size = 0; + } + + // 获取数据类型的最小值 + template + static T get_min_value() { + if constexpr (std::is_same::value) { + return -std::numeric_limits::infinity(); + } else if constexpr (std::is_same::value) { + return _f32_to_f16(-std::numeric_limits::infinity()); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(-std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } + } + + // 比较两个值的大小(处理半精度类型) + template + static bool is_greater(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return 
utils::cast(a) > utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else { + return a > b; + } + } + + // 1D最大池化 + template + void maxpool_1d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_width + c * input_width; + size_t output_offset = b * channels * output_width + c * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + + output[output_offset + ow] = max_val; + } + } + } + } + + // 2D最大池化 + template + void maxpool_2d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_height * input_width + c * input_height * input_width; + size_t output_offset = b * channels * output_height * output_width + c * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + + output[output_offset + oh * output_width + ow] = max_val; + } + } + } + } + } + + // 3D最大池化 + template + void maxpool_3d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t 
kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_depth * input_height * input_width + c * input_depth * input_height * input_width; + size_t output_offset = b * channels * output_depth * output_height * output_width + c * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kd * input_height * input_width + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + } + + output[output_offset + od * output_height * output_width + oh * output_width + ow] = max_val; + } + } + } + } + } + } + + // 主要的最大池化计算函数 + template + void maxpool_cpu(T *output, const T *input) const { + switch (info.ndim) { + case 1: + maxpool_1d(output, input); + break; + case 2: + maxpool_2d(output, input); + break; + case 3: + maxpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + MaxPoolInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + 
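// Aside (not part of the patch): the bf16 case reuses the same templated maxpool_cpu kernel as
// the f32/f16 cases; element comparisons funnel through is_greater(), which for the
// half-precision types presumably widens to float first, i.e. something along the lines of
//
//   utils::cast<float>(a) > utils::cast<float>(b)
//
// so bf16/f16 inputs are compared with ordinary float semantics and no half-precision
// arithmetic is required on the CPU path.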
maxpool_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::maxpool::cpu diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h new file mode 100644 index 000000000..f3ecd349d --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CPU_H__ +#define __MAX_POOL_CPU_H__ + +#include "../maxpool.h" + +DESCRIPTOR(cpu) + +#endif // __MAX_POOL_CPU_H__ diff --git a/src/infiniop/ops/maxpool/info.h b/src/infiniop/ops/maxpool/info.h new file mode 100644 index 000000000..ff56fe28c --- /dev/null +++ b/src/infiniop/ops/maxpool/info.h @@ -0,0 +1,113 @@ +#ifndef __MAX_POOL_INFO_H__ +#define __MAX_POOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::maxpool { + +inline utils::Result calculateMaxPoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + // 理论最大输出数 + size_t max_output = 0; + if (ceil_mode) { + max_output = (input_size + 2 * padding - kernel_size + stride - 1) / stride + 1; + } else { + max_output = (input_size + 2 * padding - kernel_size) / stride + 1; + } + + size_t valid_output = 0; + for (size_t i = 0; i < max_output; ++i) { + int64_t start = static_cast(i) * stride - padding; + int64_t end = start + kernel_size; + // 判断区间 [start, end) 和 [0, input_size) 是否有交集 + int64_t real_start = std::max(start, int64_t(0)); + int64_t real_end = std::min(end, int64_t(input_size)); + if (real_end > real_start) { + ++valid_output; + } + } + return utils::Result(valid_output); +} + +class MaxPoolInfo { + MaxPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + MaxPoolInfo info; + + if (input_desc->ndim() < 3 || 
input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // spatial dimensions + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // Get spatial dimensions + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + auto output_size = calculateMaxPoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size); + size_t expected_size = output_size.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::maxpool + +#endif // __MAX_POOL_INFO_H__ diff --git a/src/infiniop/ops/maxpool/maxpool.h b/src/infiniop/ops/maxpool/maxpool.h new file mode 100644 index 000000000..5ee7703c5 --- /dev/null +++ b/src/infiniop/ops/maxpool/maxpool.h @@ -0,0 +1,53 @@ +#ifndef __MAX_POOL_H__ +#define __MAX_POOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::maxpool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + MaxPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + MaxPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __MAX_POOL_H__ diff --git a/src/infiniop/ops/maxpool/metax/maxpool_metax.cc b/src/infiniop/ops/maxpool/metax/maxpool_metax.cc new file mode 100644 index 000000000..b70286abd --- /dev/null +++ b/src/infiniop/ops/maxpool/metax/maxpool_metax.cc @@ -0,0 +1,217 @@ +#include "maxpool_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_desc, 
hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + hcdnnDataType_t hcdnn_data_type) { + // Create the input/output tensor descriptors + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_desc)); + + // Build the input/output dimensions (NCHW layout) + std::vector input_dims = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims = {static_cast(info.batch), + static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + output_dims.push_back(static_cast(info.output_dims[i])); + } + + // Pad with a dummy dimension for 1D pooling + if (info.ndim == 1) { + input_dims.push_back(1); + output_dims.push_back(1); + } + + // Compute the input/output tensor strides + std::vector input_strides(input_dims.size(), 1); + std::vector output_strides(output_dims.size(), 1); + for (int i = input_dims.size() - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + output_strides[i] = output_strides[i + 1] * output_dims[i + 1]; + } + + // Set the tensor descriptors (NCHW layout) + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, hcdnn_data_type, input_dims.size(), input_dims.data(), input_strides.data())); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + output_desc, hcdnn_data_type, output_dims.size(), output_dims.data(), + output_strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolInfo &info) { + // Build the pooling parameters + std::vector kernel_size, strides, pads; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_size.push_back(static_cast(info.kernel_sizes[i])); + strides.push_back(static_cast(info.strides[i])); + pads.push_back(static_cast(info.pads[i])); + } + + // Pad with a dummy dimension for 1D pooling + if (info.ndim == 1) { + kernel_size.push_back(1); + strides.push_back(1); + pads.push_back(0); + } + + // Set the max pooling descriptor (deterministic mode) + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + pooling_desc, HCDNN_POOLING_MAX_DETERMINISTIC, // deterministic max pooling + HCDNN_NOT_PROPAGATE_NAN, // do not propagate NaN + kernel_size.size(), + kernel_size.data(), + pads.data(), + strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Max pooling typically needs no workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , + input_desc(other.input_desc), output_desc(other.output_desc), pooling_desc(other.pooling_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, 
infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_HCDNN_API + const float alpha = 1.0f, beta = 0.0f; + + // Run the max pooling forward pass + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + CHECK_MCDNN(hcdnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::metax diff --git a/src/infiniop/ops/maxpool/metax/maxpool_metax.h b/src/infiniop/ops/maxpool/metax/maxpool_metax.h new file mode 100644 index 000000000..5051358de --- /dev/null +++ b/src/infiniop/ops/maxpool/metax/maxpool_metax.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_METAX_H__ +#define __MAX_POOL_METAX_H__ + +#include "../maxpool.h" + +DESCRIPTOR(metax) + +#endif // __MAX_POOL_METAX_H__ diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu new file mode 100644 index 000000000..8b94a29c1 --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu @@ -0,0 +1,240 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "maxpool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : 
internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + // Create CUDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + // Setup tensor descriptors + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolInfo &info) { + // Setup pooling descriptor + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Max pooling typically doesn't need workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, infiniDtype_t data_type) { +#ifdef 
ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::nvidia diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh new file mode 100644 index 000000000..539ad5a1a --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CUDA_CUH__ +#define __MAX_POOL_CUDA_CUH__ + +#include "../maxpool.h" + +DESCRIPTOR(nvidia) + +#endif // __MAX_POOL_CUDA_CUH__ diff --git a/src/infiniop/ops/maxpool/operator.cc b/src/infiniop/ops/maxpool/operator.cc new file mode 100644 index 000000000..aedfc0585 --- /dev/null +++ b/src/infiniop/ops/maxpool/operator.cc @@ -0,0 +1,155 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maxpool.h" + +#ifdef ENABLE_CPU_API +#include "cpu/maxpool_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/maxpool_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/maxpool_metax.h" +#endif + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor( + infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) 
{ + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::maxpool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize( + infiniopMaxPoolDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopMaxPool( + infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc new file mode 100644 index 000000000..55a340226 --- /dev/null +++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc @@ -0,0 +1,102 @@ +#include "reduce_max_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_max::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto result = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t reduce_max(const ReduceMaxInfo 
*info, T *y, const T *x) { + const size_t cols = info->shape[3]; // 规约维度 + const ptrdiff_t y_batch_stride = info->y_strides[0]; + const ptrdiff_t y_channel_stride = info->y_strides[1]; + const ptrdiff_t y_row_stride = info->y_strides[2]; + const ptrdiff_t x_batch_stride = info->x_strides[0]; + const ptrdiff_t x_channel_stride = info->x_strides[1]; + const ptrdiff_t x_row_stride = info->x_strides[2]; + const ptrdiff_t x_col_stride = info->x_strides[3]; +#ifdef _WIN32 + const ptrdiff_t batch_size = static_cast(info->shape[0]); + const ptrdiff_t channels = static_cast(info->shape[1]); + const ptrdiff_t rows = static_cast(info->shape[2]); +#pragma omp parallel for + for (ptrdiff_t batch = 0; batch < batch_size; ++batch) { + for (ptrdiff_t channel = 0; channel < channels; ++channel) { + for (ptrdiff_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + + channel * x_channel_stride + + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + + channel * y_channel_stride + + row * y_row_stride; + + float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride); + + if constexpr (std::is_same::value || std::is_same::value) { + *output_ptr = utils::cast(max); + } else { + *output_ptr = max; + } + } + } + } +#else + const size_t batch_size = info->shape[0]; + const size_t channels = info->shape[1]; + const size_t rows = info->shape[2]; +#pragma omp parallel for collapse(3) + for (size_t batch = 0; batch < batch_size; ++batch) { + for (size_t channel = 0; channel < channels; ++channel) { + for (size_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + + channel * x_channel_stride + + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + + channel * y_channel_stride + + row * y_row_stride; + + float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride); + + if constexpr (std::is_same::value || std::is_same::value) { + *output_ptr = utils::cast(max); + } else { + *output_ptr = max; + } + } + } + } +#endif //_WIN32 + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(reduce_max(&_info, (fp16_t *)y, (const fp16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(reduce_max(&_info, (bf16_t *)y, (const bf16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(reduce_max(&_info, (float *)y, (const float *)x)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reduce_max::cpu diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h new file mode 100644 index 000000000..bf6be9b74 --- /dev/null +++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h @@ -0,0 +1,7 @@ +#ifndef __REDUCE_MAX_CPU_H__ +#define __REDUCE_MAX_CPU_H__ +#include "../reduce_max.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/reduce_max/info.h b/src/infiniop/ops/reduce_max/info.h new file mode 100644 index 000000000..99bb5be19 --- /dev/null +++ b/src/infiniop/ops/reduce_max/info.h @@ -0,0 +1,62 @@ +#ifndef __REDUCE_MAX_INFO_H__ +#define __REDUCE_MAX_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::reduce_max { + +class ReduceMaxInfo { + ReduceMaxInfo() = default; + +public: + infiniDtype_t dtype; + + std::vector shape; + std::vector y_strides; 
+ std::vector x_strides; + + static utils::Result create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) { + auto dtype = y_desc->dtype(); + if (dtype != x_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + + size_t ndim = y_desc->ndim(); + if (x_desc->ndim() != ndim) { + CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape()); + if (ndim > 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } else if (ndim == 0) { + std::vector shape = {1, 1, 1, 1}; + std::vector y_strides = {0, 0, 0, 0}; + std::vector x_strides = {0, 0, 0, 0}; + return utils::Result(ReduceMaxInfo{ + dtype, shape, y_strides, x_strides}); + } else { + std::vector shape = x_desc->shape(); + std::vector y_strides = y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)) { + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result(ReduceMaxInfo{ + dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_max + +#endif // __REDUCE_MAX_INFO_H__ diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h new file mode 100644 index 000000000..735bc8da4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_METAX_H__ +#define __REDUCE_MAX_METAX_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca new file mode 100644 index 000000000..ab120fea5 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_max_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template +INFINIOP_METAX_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, 
ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax + <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax + <<>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::metax diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh new file mode 100644 index 000000000..fec2f9341 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh @@ -0,0 +1,21 @@ +#ifndef __REDUCE_MAX_KERNEL_CUH__ +#define __REDUCE_MAX_KERNEL_CUH__ + +template +__device__ void ReduceMaxKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] Find the max of each updated row and store in shared memory + Tcompute max_0 = op::common_cuda::reduce_op::max(x, width, x_stride_w); + if (threadIdx.x == 0) { + *y = max_0; + } +} + +#endif // __REDUCE_MAX_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu new file mode 100644 index 000000000..f64f596f4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu @@ -0,0 +1,101 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_max_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template 
+INFINIOP_CUDA_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax + <<>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::nvidia 
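For reference, the ReduceMax entry points added in this patch follow the same create/get-workspace/run/destroy sequence as the other descriptor-based operators. The sketch below is illustrative only and not part of the patch: it assumes a valid infiniopHandle_t, tensor descriptors for a 4-D input and its reduced output, device buffers, and a stream have already been created elsewhere, and the helper name reduce_max_last_dim is hypothetical.

#include "infiniop.h"

// Hypothetical helper: reduce x along its last axis (dim = 3) into y.
// Handle, descriptors, device buffers, and stream are assumed to be set up by the caller.
infiniStatus_t reduce_max_last_dim(infiniopHandle_t handle,
                                   infiniopTensorDescriptor_t y_desc,
                                   infiniopTensorDescriptor_t x_desc,
                                   void *y, const void *x, void *stream) {
    infiniopReduceMaxDescriptor_t desc = nullptr;
    // ReduceMaxInfo::create canonicalizes both tensors to 4-D and swaps the
    // reduced axis to the innermost position, so any valid dim works here.
    infiniStatus_t status = infiniopCreateReduceMaxDescriptor(handle, &desc, y_desc, x_desc, 3);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetReduceMaxWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS) {
        // All backends in this patch report a zero-sized workspace, so no buffer is allocated.
        status = infiniopReduceMax(desc, nullptr, workspace_size, y, x, stream);
    }

    infiniopDestroyReduceMaxDescriptor(desc);
    return status;
}

The descriptor can be reused for repeated launches on tensors of the same shape; the opaque backend state is released only by infiniopDestroyReduceMaxDescriptor.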
diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh new file mode 100644 index 000000000..388738c27 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_NVIDIA_H__ +#define __REDUCE_MAX_NVIDIA_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/reduce_max/operator.cc b/src/infiniop/ops/reduce_max/operator.cc new file mode 100644 index 000000000..ad7b33393 --- /dev/null +++ b/src/infiniop/ops/reduce_max/operator.cc @@ -0,0 +1,181 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_max.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_max_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/reduce_max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_max_metax.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/reduce_max_ascend.h" +#endif + +__C infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + dim); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // // return cnnlCreateReduceMaxDescriptor((BangHandle_t) handle, (ReduceMaxCnnlDescriptor_t *) desc_ptr, output_desc, input_desc, dim); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); + // // return cnnlGetReduceMaxWorkspaceSize((ReduceMaxCnnlDescriptor_t) desc, size); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); + // } + // #endif + } + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, output, input, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // // return cnnlReduceMax((ReduceMaxCnnlDescriptor_t) desc, workspace, workspace_size, output, input, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); + // // return cnnlDestroyReduceMaxDescriptor((ReduceMaxCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/reduce_max/reduce_max.h b/src/infiniop/ops/reduce_max/reduce_max.h new file mode 100644 index 000000000..c6edd7365 --- /dev/null +++ b/src/infiniop/ops/reduce_max/reduce_max.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::reduce_max::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ReduceMaxInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ReduceMaxInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + size_t dim); \ + \ + infiniStatus_t calculate( 
\ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // REDUCE_MAX_H diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc new file mode 100644 index 000000000..7853afdbb --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc @@ -0,0 +1,102 @@ +#include "reduce_mean_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_mean::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto result = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) { + const size_t cols = info->shape[3]; // 规约维度 + const ptrdiff_t y_batch_stride = info->y_strides[0]; + const ptrdiff_t y_channel_stride = info->y_strides[1]; + const ptrdiff_t y_row_stride = info->y_strides[2]; + const ptrdiff_t x_batch_stride = info->x_strides[0]; + const ptrdiff_t x_channel_stride = info->x_strides[1]; + const ptrdiff_t x_row_stride = info->x_strides[2]; + const ptrdiff_t x_col_stride = info->x_strides[3]; +#ifdef _WIN32 + const ptrdiff_t batch_size = static_cast(info->shape[0]); + const ptrdiff_t channels = static_cast(info->shape[1]); + const ptrdiff_t rows = static_cast(info->shape[2]); +#pragma omp parallel for + for (ptrdiff_t batch = 0; batch < batch_size; ++batch) { + for (ptrdiff_t channel = 0; channel < channels; ++channel) { + for (ptrdiff_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + + channel * x_channel_stride + + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + + channel * y_channel_stride + + row * y_row_stride; + + float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols; + + if constexpr (std::is_same::value || std::is_same::value) { + *output_ptr = utils::cast(mean); + } else { + *output_ptr = mean; + } + } + } + } +#else + const size_t batch_size = info->shape[0]; + const size_t channels = info->shape[1]; + const size_t rows = info->shape[2]; +#pragma omp parallel for collapse(3) + for (size_t batch = 0; batch < batch_size; ++batch) { + for (size_t channel = 0; channel < channels; ++channel) { + for (size_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + + channel * x_channel_stride + + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + + channel * y_channel_stride + + row * y_row_stride; + + float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols; + + if constexpr (std::is_same::value || std::is_same::value) { + *output_ptr = utils::cast(mean); + } else { + *output_ptr = mean; + } + } + } + } +#endif //_WIN32 + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(reduce_mean(&_info, (fp16_t *)y, (const fp16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(reduce_mean(&_info, (bf16_t *)y, (const bf16_t *)x)); + } else if (_info.dtype == 
INFINI_DTYPE_F32) { + CHECK_STATUS(reduce_mean(&_info, (float *)y, (const float *)x)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reduce_mean::cpu diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h new file mode 100644 index 000000000..f67601c62 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h @@ -0,0 +1,7 @@ +#ifndef __REDUCE_MEAN_CPU_H__ +#define __REDUCE_MEAN_CPU_H__ +#include "../reduce_mean.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h new file mode 100644 index 000000000..6c11e07d3 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/info.h @@ -0,0 +1,62 @@ +#ifndef __REDUCE_MEAN_INFO_H__ +#define __REDUCE_MEAN_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::reduce_mean { + +class ReduceMeanInfo { + ReduceMeanInfo() = default; + +public: + infiniDtype_t dtype; + + std::vector shape; + std::vector y_strides; + std::vector x_strides; + + static utils::Result create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) { + auto dtype = y_desc->dtype(); + if (dtype != x_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + + size_t ndim = y_desc->ndim(); + if (x_desc->ndim() != ndim) { + CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape()); + if (ndim > 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } else if (ndim == 0) { + std::vector shape = {1, 1, 1, 1}; + std::vector y_strides = {0, 0, 0, 0}; + std::vector x_strides = {0, 0, 0, 0}; + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } else { + std::vector shape = x_desc->shape(); + std::vector y_strides = y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)) { + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_mean + +#endif // __REDUCE_MEAN_INFO_H__ diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h new file mode 100644 index 000000000..a105724d6 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_METAX_H__ +#define __REDUCE_MEAN_METAX_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca new file mode 100644 index 000000000..0eaf1c9cb --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_mean_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template +INFINIOP_METAX_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, 
size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean + <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean + <<>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::metax diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh new file mode 100644 index 000000000..e70748605 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __REDUCE_MEAN_KERNEL_CUH__ +#define __REDUCE_MEAN_KERNEL_CUH__ + +template +__device__ void ReduceMeanKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, 
size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] Find the sum of each updated row and store in shared memory + Tcompute sum_0 = op::common_cuda::reduce_op::sum(x, width, x_stride_w); + if (threadIdx.x == 0) { + // mean_ = sum_0/width; + *y = sum_0 / width; + } + // __syncthreads(); + + // [Elementwise] Divide each element by the sum and store in shared memory + // *y = mean_; +} + +#endif // __REDUCE_MEAN_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu new file mode 100644 index 000000000..bfc26e23d --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu @@ -0,0 +1,101 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_mean_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template +INFINIOP_CUDA_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean + <<>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, 
+ void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::nvidia diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh new file mode 100644 index 000000000..be16b4491 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_NVIDIA_H__ +#define __REDUCE_MEAN_NVIDIA_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/reduce_mean/operator.cc b/src/infiniop/ops/reduce_mean/operator.cc new file mode 100644 index 000000000..7c88e4e4d --- /dev/null +++ b/src/infiniop/ops/reduce_mean/operator.cc @@ -0,0 +1,182 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_mean.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_mean_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/reduce_mean_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_mean_metax.h" +#endif +// #ifdef ENABLE_ASCEND_API +// #include "ascend/reduce_mean_ascend.h" +// #endif + +__C infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_mean::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + dim); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMeanDescriptor((BangHandle_t)handle, (ReduceMeanBangDescriptor_t *)desc_ptr, y_desc); + // // return cnnlCreateReduceMeanDescriptor((BangHandle_t) handle, (ReduceMeanCnnlDescriptor_t *) desc_ptr, y_desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return 
musaCreateReduceMeanDescriptor((MusaHandle_t)handle, (ReduceMeanMusaDescriptor_t *)desc_ptr, y_desc); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMeanWorkspaceSize((ReduceMeanBangDescriptor_t)desc, size); + // // return cnnlGetReduceMeanWorkspaceSize((ReduceMeanCnnlDescriptor_t) desc, size); + // } + + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMeanWorkspaceSize((ReduceMeanMusaDescriptor_t)desc, size); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMean((ReduceMeanBangDescriptor_t)desc, workspace, workspace_size, data, stream); + // // return cnnlReduceMean((ReduceMeanCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMean((ReduceMeanMusaDescriptor_t)desc, workspace, workspace_size, data, stream); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMeanDescriptor((ReduceMeanBangDescriptor_t)desc); + // // return cnnlDestroyReduceMeanDescriptor((ReduceMeanCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMeanDescriptor((ReduceMeanMusaDescriptor_t)desc); + // 
#endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/reduce_mean/reduce_mean.h b/src/infiniop/ops/reduce_mean/reduce_mean.h new file mode 100644 index 000000000..bf2e2dda0 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/reduce_mean.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MEAN_H +#define REDUCE_MEAN_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::reduce_mean::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ReduceMeanInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ReduceMeanInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t dim); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +#endif // REDUCE_MEAN_H diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc new file mode 100644 index 000000000..c47a38ef9 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc @@ -0,0 +1,108 @@ +#include "scatter_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::scatter::cpu { + +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + char * output, + const char * input, + const int64_t * index +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < info.ndim; d ++) + { + if (d == info.dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + int scatter_dim = int(info.dim); + size_t element_size = infiniSizeOf(info.dtype); + + #pragma omp parallel for + for (size_t n = 0; n < batch_size; n ++) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = n; + for(int d = info.ndim - 1; d >= 0; d --) { + if (d == scatter_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + input_ptr += dim_index * element_size * info.input_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + } + for (size_t c = 0; c < info.index_shape[scatter_dim]; c ++) { + int64_t scatter_number = *(index_ptr + c * info.index_strides[scatter_dim]); + memcpy( + output_ptr + scatter_number * element_size * info.output_strides[scatter_dim], + input_ptr + c * element_size * info.input_strides[scatter_dim], + element_size + ); + } + } + +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + + return calculate_scatter(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.h b/src/infiniop/ops/scatter/cpu/scatter_cpu.h new file mode 100644 index 000000000..bf2fcb7a1 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SCATTER_CPU_H__ +#define __SCATTER_CPU_H__ + +#include "../scatter.h" + +DESCRIPTOR(cpu) + + +#endif // __SCATTER_CPU_H__ diff --git a/src/infiniop/ops/scatter/cuda/kernel.cuh b/src/infiniop/ops/scatter/cuda/kernel.cuh new file mode 100644 index 000000000..6c4de6ce5 --- /dev/null +++ b/src/infiniop/ops/scatter/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __SCATTER_KERNEL_CUH__ +#define __SCATTER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void scatterKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for(int d = ndim - 1; d >= 0; d --) { + if (d == scatter_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_scatter_size; c += BLOCK_SIZE) { + int64_t scatter_number = *(index_ptr + c * index_strides[scatter_dim]); + *(output_ptr + scatter_number * output_strides[scatter_dim]) = \ + *(input_ptr + c * input_strides[scatter_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __SCATTER_KERNEL_CUH__ diff --git a/src/infiniop/ops/scatter/info.h b/src/infiniop/ops/scatter/info.h new file mode 100644 index 000000000..9f21e435c --- /dev/null +++ b/src/infiniop/ops/scatter/info.h @@ -0,0 +1,67 @@ +#ifndef __SCATTER_INFO_H__ +#define __SCATTER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::scatter { + +class ScatterInfo { +private: + ScatterInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + std::vector 
input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createScatterInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == output_desc->ndim() && output_desc->ndim() == index_desc->ndim(), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d ++){ + if(d != dim) { + CHECK_OR_RETURN( + index_desc->dim(d) <= input_desc->dim(d) && index_desc->dim(d) <= output_desc->dim(d), + INFINI_STATUS_BAD_TENSOR_SHAPE; + ); + } + } + CHECK_OR_RETURN(index_desc->dim(dim) <= input_desc->dim(dim), INFINI_STATUS_BAD_TENSOR_SHAPE); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(ScatterInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __SCATTER_INFO_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.h b/src/infiniop/ops/scatter/metax/scatter_metax.h new file mode 100644 index 000000000..d5ce0ef16 --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_METAX_H__ +#define __SCATTER_METAX_H__ + +#include "../scatter.h" + +DESCRIPTOR(metax) + +#endif // __SCATTER_METAX_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.maca b/src/infiniop/ops/scatter/metax/scatter_metax.maca new file mode 100644 index 000000000..1c742f60d --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.maca @@ -0,0 +1,190 @@ +#include "../../../devices/metax/metax_common.h" +#include "scatter_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d 
++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_SCATTER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case 
INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_SCATTER_WITH_METAX_BLOCK + #undef CALCULATE_SCATTER +} +} // namespace op::scatter::metax diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu new file mode 100644 index 000000000..7d6e1a1a1 --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu @@ -0,0 +1,190 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "scatter_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + 
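// The workspace holds four back-to-back ndim-length ptrdiff_t arrays -- contiguous,
+    // input, output and index strides -- each staged host-to-device with an async copy
+    // on the caller's stream before the kernel launch.
+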
CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + #define CALCULATE_SCATTER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return 
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + #undef CALCULATE_SCATTER_WITH_BLOCK_SIZE + #undef CALCULATE_SCATTER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::scatter::nvidia diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh new file mode 100644 index 000000000..a199edb6e --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __SCATTER_NVIDIA_API_H__ +#define __SCATTER_NVIDIA_API_H__ +#include "../scatter.h" + +DESCRIPTOR(nvidia) + +#endif // __SCATTER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/scatter/operator.cc b/src/infiniop/ops/scatter/operator.cc new file mode 100644 index 000000000..7c7de71b5 --- /dev/null +++ b/src/infiniop/ops/scatter/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/scatter.h" + +#ifdef ENABLE_CPU_API +#include "cpu/scatter_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/scatter_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/scatter_metax.h" +#endif + +__C infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scatter::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopScatter( + infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t 
+infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/scatter.h b/src/infiniop/ops/scatter/scatter.h new file mode 100644 index 000000000..7d2deefc1 --- /dev/null +++ b/src/infiniop/ops/scatter/scatter.h @@ -0,0 +1,49 @@ +#ifndef __SCATTER_H__ +#define __SCATTER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::scatter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScatterInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + ScatterInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..88ba6cdd6 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,52 @@ +#include "sin_cpu.h" + +namespace op::sin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + 
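// F16, F32, F64 and BF16 all route through the same elementwise SinOp path above;
+        // any other dtype falls through to the bad-dtype status below.
+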
default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::cpu diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..80e406f98 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,21 @@ +#ifndef __SIN_CPU_H__ +#define __SIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(sin, cpu) + +namespace op::sin::cpu { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::sin(input); + } +} SinOp; +} // namespace op::sin::cpu + +#endif // __SIN_CPU_H__ diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..30641366c --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __SIN_CUDA_H__ +#define __SIN_CUDA_H__ + +#include +#include +#include + +namespace op::sin::cuda { +typedef struct SinOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(sinf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__sinf(f0), __sinf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__sinf(inputf)); + } else if constexpr (std::is_same_v) { + return sinf(input); + } else if constexpr (std::is_same_v) { + return std::sin(input); + } else { + return std::sin(input); + } + } +} SinOp; +} // namespace op::sin::cuda + +#endif // __SIN_CUDA_H__ diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h new file mode 100644 index 000000000..5b272d4d9 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIN_METAX_API_H__ +#define __SIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sin, metax) + +#endif // __SIN_METAX_API_H__ diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca new file mode 100644 index 000000000..5ea69e139 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.maca @@ -0,0 +1,60 @@ +#include "sin_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return 
INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..6fbf952bc --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..31f5b48ef --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_CUDA_API_H__ +#define __SIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_CUDA_API_H__ diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..978561a04 --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 
+1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/softplus/operator.cc b/src/infiniop/ops/softplus/operator.cc index 2548f7d34..96c71b6f3 100644 --- a/src/infiniop/ops/softplus/operator.cc +++ b/src/infiniop/ops/softplus/operator.cc @@ -8,9 +8,9 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) #include "nvidia/softplus_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/softplus_metax.h" -#endif +// 
#ifdef ENABLE_METAX_API +// #include "metax/softplus_metax.h" +// #endif __C infiniStatus_t infiniopCreateSoftplusDescriptor( infiniopHandle_t handle, diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..23a92ed65 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,52 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..73fd7c1b6 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,21 @@ +#ifndef __TANH_CPU_H__ +#define __TANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace op::tanh::cpu { +typedef struct TanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::tanh(input); + } +} TanhOp; +} // namespace op::tanh::cpu + +#endif // __TANH_CPU_H__ diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh new file mode 100644 index 000000000..62979a20e --- /dev/null +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -0,0 +1,46 @@ +#ifndef __TANH_CUDA_H__ +#define __TANH_CUDA_H__ + +#include +#include +#include + +namespace op::tanh::cuda { +typedef struct TanhOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float tanh_f32_func(float x) const { + return tanhf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = tanh_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = tanh_f32_func(f0); + float r1 = tanh_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else 
if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = tanh_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return tanh_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::tanh(input); + } else { + return std::tanh(input); + } + } +} TanhOp; +} // namespace op::tanh::cuda + +#endif // __TANH_CUDA_H__ diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h b/src/infiniop/ops/tanh/metax/tanh_metax.h new file mode 100644 index 000000000..8432a7f0d --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANH_METAX_API_H__ +#define __TANH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanh, metax) + +#endif // __TANH_METAX_API_H__ diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca new file mode 100644 index 000000000..0a01554c4 --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca @@ -0,0 +1,60 @@ +#include "tanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::tanh::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..a2c36551c --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); 
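+    // Tanh is a unary elementwise op: a single input descriptor is expected, and its shape
+    // must match the output shape checked just below, with the same dtype set as the CPU backend.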
+ const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..cb37b2528 --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANH_CUDA_API_H__ +#define __TANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) + +#endif // __TANH_CUDA_API_H__ diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc new file mode 100644 index 000000000..d34d97df6 --- /dev/null +++ b/src/infiniop/ops/tanh/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanh_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTanhDescriptor( + infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + 
GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanh( + infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc new file mode 100644 index 000000000..de7e86e3e --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.cc @@ -0,0 +1,84 @@ +#include "where_cpu.h" + +namespace op::where::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, 
inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::cpu diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h new file mode 100644 index 000000000..3d86cb4f7 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -0,0 +1,19 @@ +#ifndef __WHERE_CPU_H__ +#define __WHERE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(where, cpu) + +namespace op::where::cpu { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + T operator()(const T &a, const T &b, const bool &cond) const { + return cond ? a : b; + } +} WhereOp; +} // namespace op::where::cpu + +#endif // __WHERE_CPU_H__ diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh new file mode 100644 index 000000000..8eb5c762b --- /dev/null +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __WHERE_CUDA_H__ +#define __WHERE_CUDA_H__ + +namespace op::where::cuda { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + __device__ __forceinline__ T operator()(const T &a, const T &b, const bool &cond) const { + return cond ? 
a : b; + } +} WhereOp; +} // namespace op::where::cuda + +#endif // __WHERE_CUDA_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h new file mode 100644 index 000000000..43bb1a945 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.h @@ -0,0 +1,8 @@ +#ifndef __WHERE_METAX_API_H__ +#define __WHERE_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(where, metax) + +#endif // __WHERE_METAX_API_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca new file mode 100644 index 000000000..b648cfbcc --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -0,0 +1,92 @@ +#include "where_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::where::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + 
return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::metax diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu new file mode 100644 index 000000000..860089bd2 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu @@ -0,0 +1,91 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "where_nvidia.cuh" + +namespace op::where::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return 
_device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::nvidia diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh new file mode 100644 index 000000000..c168364a8 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __WHERE_CUDA_API_H__ +#define __WHERE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(where, nvidia) + +#endif // __WHERE_CUDA_API_H__ diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc new file mode 100644 index 000000000..d69b1d4e1 --- /dev/null +++ b/src/infiniop/ops/where/operator.cc @@ -0,0 +1,148 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/where.h" + +#ifdef ENABLE_CPU_API +#include "cpu/where_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/where_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/where_metax.h" +#endif + +__C infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t condition_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc, \ + condition_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C 
infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b, condition}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index a1d2c2501..cce47d459 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -61,6 +61,40 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); } +// Sum(x) on non-contiguous data of length count +template +__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { + Tcompute s = 0; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + s += Tcompute(data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Sum(s); +} + +// Max(x) on non-contiguous data of length count +template +__device__ __forceinline__ Tdata max(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { + Tdata max_ = data_ptr[0]; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + max_ = cub::Max()(max_, data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); +} + } // namespace op::common_cuda::reduce_op #endif diff --git a/src/utils/check.h b/src/utils/check.h index 7f4a2bdd9..76823aa40 100644 --- a/src/utils/check.h +++ b/src/utils/check.h @@ -59,4 +59,22 @@ #define CHECK_SAME_STRIDES(FIRST, ...) 
CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__) +#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE) \ + do { \ + if (INPUT_SHAPE.empty()) { \ + if (!EXPECTED_SHAPE.empty()) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + break; \ + } \ + if (DIM >= INPUT_SHAPE.size()) { \ + return INFINI_STATUS_BAD_PARAM; \ + } \ + std::vector reduced_shape = INPUT_SHAPE; \ + reduced_shape[DIM] = 1; \ + if (reduced_shape != EXPECTED_SHAPE) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + } while (0) + #endif // INFINIUTILS_CHECK_H diff --git a/test/infiniop/averagepool.py b/test/infiniop/averagepool.py new file mode 100644 index 000000000..55d5c37cf --- /dev/null +++ b/test/infiniop/averagepool.py @@ -0,0 +1,239 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from typing import Tuple +import math +from torch.nn import functional as F + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # ============ 1D Average Pooling Tests (converted to MaxPool format) ============ + # Basic cases + ((4, 8, 128), None, (3,), (1,), (0,), False), # kernel=3, stride=1, pad=0 + ((2, 16, 256), None, (5,), (2,), (2,), False), # kernel=5, stride=2, pad=2 + ((8, 4, 64), None, (7,), (3,), (1,), False), # kernel=7, stride=3, pad=1 + # ceil_mode variations + ((1, 3, 99), None, (4,), (3,), (1,), True), # kernel=4, stride=3, pad=1 + ((3, 2, 77), None, (6,), (4,), (0,), True), # kernel=6, stride=4, pad=0 + # ============ 2D Average Pooling Tests ============ + # Basic cases with square kernels + ((2, 3, 64, 64), None, (3, 3), (1, 1), (1, 1), False), + ((4, 16, 128, 128), None, (5, 5), (2, 2), (2, 2), False), + ((1, 8, 96, 96), None, (7, 7), (3, 3), (0, 0), False), + # Rectangular kernels + ((2, 4, 80, 120), None, (3, 5), (1, 2), (1, 2), False), + ((1, 6, 72, 48), None, (7, 3), (2, 1), (3, 1), False), + ((3, 2, 56, 84), None, (2, 4), (2, 3), (0, 2), False), + # ceil_mode variations + ((1, 1, 33, 33), None, (4, 4), (3, 3), (1, 1), True), + ((2, 5, 77, 89), None, (5, 3), (4, 2), (2, 1), True), + # ============ 3D Average Pooling Tests ============ + # Basic cubic kernels + ((1, 2, 32, 32, 32), None, (3, 3, 3), (1, 1, 1), (1, 1, 1), False), + ((2, 4, 48, 48, 48), None, (5, 5, 5), (2, 2, 2), (2, 2, 2), False), + ((1, 1, 64, 64, 64), None, (7, 7, 7), (3, 3, 3), (0, 0, 0), False), + # Non-cubic kernels + ((1, 3, 24, 36, 48), None, (2, 3, 4), (1, 2, 2), (0, 1, 2), False), + ((2, 2, 40, 32, 56), None, (5, 3, 7), (2, 1, 3), (2, 1, 3), False), + ((1, 1, 28, 44, 36), None, (3, 5, 2), (2, 3, 1), (1, 2, 1), False), + # ceil_mode variations + ((1, 1, 27, 27, 27), None, (4, 4, 4), (3, 3, 3), (1, 1, 1), True), + ((2, 2, 33, 45, 39), None, (5, 3, 4), (3, 2, 3), (2, 1, 1), True), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def averagepool(input_tensor, kernel_size, stride, padding, ceil_mode, output_tensor): + ndim = len(input_tensor.shape) - 2 + if ndim == 1: + result = F.avg_pool1d( + input_tensor.to(torch.float32), kernel_size[0], stride[0], padding[0], 
ceil_mode=ceil_mode + ) + elif ndim == 2: + result = F.avg_pool2d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + elif ndim == 3: + result = F.avg_pool3d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + # 将计算结果转换回原始数据类型 + output_tensor.copy_(result.to(output_tensor.dtype)) + + +def infer_output_shape(input_shape, kernel_size, stride, padding, ceil_mode): + def calc_output_size(input_size, k, s, p, ceil_mode): + return ( + math.ceil((input_size + 2 * p - k) / s + 1) + if ceil_mode + else math.floor((input_size + 2 * p - k) / s + 1) + ) + + batch, channel, *spatial = input_shape + output_spatial = [ + calc_output_size(spatial[i], kernel_size[i], stride[i], padding[i], ceil_mode) + for i in range(len(spatial)) + ] + return (batch, channel) + tuple(output_spatial) + + +def tuple_to_void_p(py_tuple: Tuple): + arr = (ctypes.c_uint64 * len(py_tuple))(*py_tuple) + return ctypes.cast(arr, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_shape = infer_output_shape( + input_shape, kernel_size, stride, padding, ceil_mode + ) + output_tensor = TestTensor(output_shape, None, dt=tensor_dtype, device=device) + + print( + f"Testing AvgPool on {InfiniDeviceNames[device]} with input_shape: {input_shape}, kernel_size: {kernel_size}, stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ) + + if sync: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + for tensor in [input_tensor, output_tensor]: + if tensor: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_averagepool(): + check_error( + LIBINFINIOP.infiniopAvgPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_averagepool() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Mismatch for shape {input_shape}, kernel {kernel_size}" + + if PROFILE: + profile_operation( + "PyTorch", + lambda: averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lib_averagepool, device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyAvgPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + 
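    # Sanity check of the output-size arithmetic used by infer_output_shape
    # above (a minimal sketch, assuming that helper): for an axis of length L
    # with kernel k, stride s and padding p the output length is
    # floor((L + 2*p - k) / s + 1), or the ceil of the same expression when
    # ceil_mode is True; e.g. L=5, k=2, s=2, p=0 gives 2 and 3 respectively.
    assert infer_output_shape((1, 1, 5), (2,), (2,), (0,), False) == (1, 1, 2)
    assert infer_output_shape((1, 1, 5), (2,), (2,), (0,), True) == (1, 1, 3)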
DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/batch_norm.py b/test/infiniop/batch_norm.py new file mode 100644 index 000000000..a7b46858f --- /dev/null +++ b/test/infiniop/batch_norm.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, momentum, eps + ((13, 4, 5,), 0.1, 1e-5), + ((2, 3, 4), 0.1, 1e-4), + ((15, 16, 17,), 0.2, 1e-5), + ((50, 60, 70), 0.1, 1e-4), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# No implement for INPLACE + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_batch_norm( + output: torch.Tensor, + running_mean: torch.Tensor, + running_var: torch.Tensor, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + init_running_mean: torch.Tensor, + init_running_var: torch.Tensor, + momentum: float, + eps: float +): + bn = torch.nn.BatchNorm1d( + num_features=input.shape[1], + eps=eps, + momentum=momentum, + dtype=input.dtype, + ) + bn.weight.data = weight + bn.bias.data = bias + bn.running_mean.data = init_running_mean + bn.running_var.data = init_running_var + output.copy_(bn(input).detach()) + running_mean.copy_(bn.running_mean.data) + running_var.copy_(bn.running_var.data) + + +def test( + handle, + device, + shape, momentum, eps, + inplace, + dtype, + sync=None, +): + running_mean = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + running_var = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + + input = TestTensor( + shape, + None, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + output = input + else: + output = TestTensor( + shape, + None, + dtype, + device + ) + + weight = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + bias = TestTensor( + [shape[1]], + None, + dtype, + device, + ) + + + print( + f"Testing BatchNorm on {InfiniDeviceNames[device]} with shape:{shape}, inplace:{inplace}, momentum:{momentum}, eps:{eps}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + + torch_batch_norm( + output.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), + input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor(), + running_mean.torch_tensor(), running_var.torch_tensor(), + momentum, eps + ) + + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateBatchNormDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + 
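            # Semantics exercised by this test (a sketch, assuming PyTorch's
            # training-mode BatchNorm as in torch_batch_norm above):
            #   running_mean = (1 - momentum) * running_mean + momentum * batch_mean
            #   running_var  = (1 - momentum) * running_var  + momentum * unbiased_batch_var
            # while the output itself is normalized with the biased batch
            # variance plus eps.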
running_mean.descriptor, + running_var.descriptor, + input.descriptor, + weight.descriptor, + bias.descriptor, + momentum, + eps + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, running_mean, running_var, input, weight, bias]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetBatchNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_batch_norm(): + check_error( + LIBINFINIOP.infiniopBatchNorm( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + running_mean.data(), + running_var.data(), + input.data(), + weight.data(), + bias.data(), + None, + ) + ) + + lib_batch_norm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + debug(running_mean.actual_tensor(), running_mean.torch_tensor(), atol=atol, rtol=rtol) + debug(running_var.actual_tensor(), running_var.torch_tensor(), atol=atol, rtol=rtol) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(running_mean.actual_tensor(), running_mean.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(running_var.actual_tensor(), running_var.torch_tensor(), atol=atol, rtol=rtol) + + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_batch_norm( + output.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), + input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor(), running_mean.torch_tensor(), running_var.torch_tensor(), momentum, eps + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_batch_norm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyBatchNormDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my BatchNorm passed!\033[0m") diff --git a/test/infiniop/cross_entropy_loss.py b/test/infiniop/cross_entropy_loss.py new file mode 100644 index 000000000..acc5cadc4 --- /dev/null +++ b/test/infiniop/cross_entropy_loss.py @@ -0,0 +1,213 @@ +import torch +import ctypes +from ctypes import c_uint64 +import numpy as np + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + infiniopOperatorDescriptor_t, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + TestWorkspace, + InfiniDeviceEnum, +) +from torch.nn import functional as F + +_TEST_CASES = [ + # Single sample classification + ((10,), 10), + ((200,), 200), + # 2D: (N, C) - batch classification + ((4, 10), 10), + ((8, 5), 5), + ((16, 100), 100), + ((32, 1000), 1000), + ((64, 21), 21), + ((128, 50), 50), + # 3D: (N, C, d1) - sequence classification + ((4, 10, 5), 10), + # 4D: (N, C, d1, d2) - image segmentation + ((2, 8, 8, 8), 8), + # 5D: (N, C, d1, d2, d3) - 3D segmentation + ((3, 10, 10, 20, 30), 10), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, 
"rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cross_entropy_loss_pytorch(logits, target): + return F.cross_entropy(logits.double(), target.long(), reduction="mean") + + +def test( + handle, + device, + input_shape, + num_classes, + tensor_dtype=InfiniDtype.F32, + sync=None, +): + # 根据输入形状确定logits和target的形状 + if len(input_shape) == 1: + # Shape (C,) - single sample classification + logits_shape = (num_classes,) + target_shape = (1,) # 修改:使用 (1,) 而不是标量 + else: + # Shape (N, C, [d1], [d2], ...) + logits_shape = input_shape + target_shape = (input_shape[0],) + input_shape[2:] + + print( + f"Testing CrossEntropyLoss on {InfiniDeviceNames[device]} with logits_shape: {logits_shape}, target_shape: {target_shape}, dtype:{InfiniDtypeNames[tensor_dtype]}" + ) + + # 创建logits张量 + logits = TestTensor(logits_shape, None, dt=tensor_dtype, device=device) + + # 创建target张量 + target_torch = torch.randint( + 0, + num_classes, + target_shape, + dtype=torch.long, + device=logits.torch_tensor().device, + ) + target = TestTensor.from_torch(target_torch, dt=InfiniDtype.I64, device=device) + + # 创建loss张量 + loss = TestTensor((1,), None, dt=tensor_dtype, device=device) + + # 计算PyTorch参考损失 + if len(input_shape) == 1: + # 对于一维logits,target需要是标量 + target_scalar = target.torch_tensor()[0] + pytorch_loss = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + pytorch_loss = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + + # 将参考结果存储到loss张量 + loss.torch_tensor()[0] = pytorch_loss.to(loss.torch_tensor().dtype) + + if sync: + sync() + + # 创建算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossDescriptor( + handle, + ctypes.byref(descriptor), + loss.descriptor, + logits.descriptor, + target.descriptor, + ) + ) + + # 销毁tensor的描述符以防止内核直接使用 + for tensor in [logits, target, loss]: + tensor.destroy_desc() + + # 获取工作空间大小并创建工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + # PyTorch参考实现函数 + def torch_cross_entropy(): + if len(input_shape) == 1: + target_scalar = target.torch_tensor()[0] + result = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + result = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + loss.torch_tensor()[0] = result.to(loss.torch_tensor().dtype) + + # InfiniOP实现函数 + def lib_cross_entropy(): + check_error( + LIBINFINIOP.infiniopCrossEntropyLoss( + descriptor, + workspace.data(), + workspace_size.value, + loss.data(), + logits.data(), + target.data(), + None, + ) + ) + + # 执行InfiniOP算子 + lib_cross_entropy() + + if sync: + sync() + + # 验证结果 + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + actual_loss = loss.actual_tensor()[0] + expected_loss = loss.torch_tensor()[0] + + if DEBUG: + print(f"Expected loss: {expected_loss.item()}") + print(f"Actual loss: {actual_loss.item()}") + if target_shape: + print( + f"Target shape: {target_shape}, first few targets: {target.torch_tensor().flatten()[:5]}" + ) + else: + print(f"Target (scalar): {target.torch_tensor()[0].item()}") + debug(actual_loss, expected_loss, atol=atol, rtol=rtol) + + if not torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol): + print("--- ERROR ANALYSIS 
---") + print(f"Expected: {expected_loss.item()}, Actual: {actual_loss.item()}") + print(f"Difference: {abs(actual_loss - expected_loss).item()}") + print(f"Tolerance: atol={atol}, rtol={rtol}") + + assert torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol) + + # Profile功能 + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCrossEntropyLossDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mAll CrossEntropyLoss tests passed!\033[0m") diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..de37404b3 --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,183 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b): + torch.div(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device, bias=1e-6) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == 
Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDivWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_div(): + check_error( + LIBINFINIOP.infiniopDiv( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_div() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py new file mode 100644 index 000000000..3b78098dd --- /dev/null +++ b/test/infiniop/equal.py @@ -0,0 +1,201 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, a_stride, b_stride + ((13, 4), None, None), + ((13, 4), (13, 1), (13, 1)), + ((13, 4, 4), (16, 4, 1), (16, 4, 1),), + ((16, 5632), None, None), +] + +class Identical(Enum): + EQUAL = auto() + NOT_EQUAL = auto() + + +_IDENTICAL = [ + Identical.EQUAL, # -> result=true + Identical.NOT_EQUAL, # -> result=false +] + +_TEST_CASES = [ + test_case + (identical_item,) + for test_case in _TEST_CASES_ + for identical_item in _IDENTICAL +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False 
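# Reference semantics for the Equal operator under test (a minimal sketch,
# assuming whole-tensor comparison into a single bool, as in torch_equal below):
# torch.equal(a, b) is True only when the shapes and every element match, e.g.
#   torch.equal(torch.tensor([1, 2]), torch.tensor([1, 2]))  -> True
#   torch.equal(torch.tensor([1, 2]), torch.tensor([1, 3]))  -> False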
+PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_equal(c, a, b): + return torch.tensor(torch.equal(input=a, other=b), dtype=torch.bool) + + +def test( + handle, + device, + input_shape, + a_strides, + b_strides, + identical, + dtype, + sync=None, +): + torch_dtype = { + InfiniDtype.F16: torch.half, + InfiniDtype.F32: torch.float, + InfiniDtype.BF16: torch.bfloat16, + InfiniDtype.I32: torch.int32, + InfiniDtype.I64: torch.int64 + }[dtype] + + print( + f"Testing equal on {InfiniDeviceNames[device]} with input_shape:{input_shape}," + f"a_stride:{a_strides} b_stride:{b_strides} identical:{identical}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + torch_c = torch.tensor([False], dtype=torch.bool) + c = TestTensor( + [1], + torch_c.stride(), + InfiniDtype.BOOL, + device, + "manual", + set_tensor=torch_c + ) + + torch_a = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) + if a_strides is not None: + torch_a.as_strided_(input_shape, a_strides) + a = TestTensor( + input_shape, + torch_a.stride(), + dtype, + device, + "manual", + set_tensor=torch_a + ) + if identical == Identical.EQUAL: + torch_b = torch_a.clone() + else: + torch_b = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) + if b_strides is not None: + torch_b.as_strided_(input_shape, b_strides) + + b = TestTensor( + input_shape, + torch_b.stride(), + dtype, + device, + "manual", + set_tensor=torch_b + ) + + + c._torch_tensor = torch_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateEqualDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [c, a, b]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetEqualWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_equal(): + check_error( + LIBINFINIOP.infiniopEqual( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_equal() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor().to(torch.uint8), c.torch_tensor().to(torch.uint8), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_equal( + c.torch_tensor(), a.torch_tensor(), b.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my equal passed!\033[0m") diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..eb139af12 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,165 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + 
check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(output, input): + output.copy_(torch.exp(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + exp(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() 
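    # Note on the zero-stride cases in _TEST_CASES_ above (a sketch): a stride
    # of 0 makes every row alias the same storage, so such an output cannot be
    # written element-wise and test() skips it via output.is_broadcast().
    aliased = torch.empty(4).as_strided((3, 4), (0, 1))
    assert aliased.stride() == (0, 1) and aliased.data_ptr() == aliased[1].data_ptr()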
+ + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/gather.py b/test/infiniop/gather.py new file mode 100644 index 000000000..b5c8ea93d --- /dev/null +++ b/test/infiniop/gather.py @@ -0,0 +1,160 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + # input_shape, output_shape, dim, input_strides, output_strides, index_strides + ((2, 3, 7), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), 1, (30, 5, 1), None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ((11, 20, 20, 13, 37), (11, 20, 20, 13, 37), 1, None, None, None) +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_gather(output, input, dim, index): + torch.gather(input, dim, index, out=output) + +def test( + handle, + device, + input_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing Gather on {InfiniDeviceNames[device]} with input shape:{input_shape}, dim:{dim}, output_shape:{output_shape}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device + ) + torch_index = torch.randint(low=0, high=input_shape[dim], size=output_shape, dtype=torch.int64) + if index_strides: + torch_index = torch_index.as_strided(output_shape, index_strides) + index = TestTensor( + output_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + ) + + torch_gather(output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGatherWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_gather(): + check_error( + LIBINFINIOP.infiniopGather( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_gather() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print("x:", input.torch_tensor()) 
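        # Gather semantics being verified (a sketch of torch.gather): along dim,
        # output[i][j] = input[i][index[i][j]] for dim=1, e.g.
        #   torch.gather(torch.tensor([[1, 2], [3, 4]]), 1, torch.tensor([[1, 0], [0, 0]]))
        #   -> tensor([[2, 1], [3, 3]])
        # so every index value must lie in [0, input.shape[dim]), as generated above.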
+ # print("CALCULATED:\n", output.actual_tensor(), ) + # print("GT\n", output.torch_tensor()) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_gather( + output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gather(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGatherDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my Gather passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..424b30567 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,167 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hardswish(output, input): + output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + hardswish(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + 
LIBINFINIOP.infiniopCreateHardswishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardswishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardswish( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/index_copy_inplace.py b/test/infiniop/index_copy_inplace.py new file mode 100644 index 000000000..97dbd8266 --- /dev/null +++ b/test/infiniop/index_copy_inplace.py @@ -0,0 +1,180 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_TEST_CASES = [ + # input_shape, output_shape, dim, output_strides, input_strides, + ([13, 1], [13, 4], 1, [37, 1], [37, 1], Inplace.OUT_OF_PLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.INPLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.OUT_OF_PLACE), + ([133, 23, 53], [133, 23, 53], 1, None, None, Inplace.OUT_OF_PLACE), + ([133, 23, 13, 53], [133, 23, 13, 53], 2, None, None, Inplace.OUT_OF_PLACE), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_index_copy_inplace(output, input, index, dim): + output.index_copy_(dim, index, input.clone()) + + +def test( + handle, + device, + input_shape, output_shape, dim, output_strides, input_strides, + inplace, + dtype, + sync=None, +): + print( + f"Testing index_copy_inplace on {InfiniDeviceNames[device]} with shape:{input_shape}," + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = 
TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + assert output_shape == input_shape + output = input + else: + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + index_list = list(range(output_shape[dim])) + + random.shuffle(index_list) + torch_index = torch.tensor(index_list[:input_shape[dim]], dtype=torch.int64) + index = TestTensor( + [input_shape[dim]], + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_index_copy_inplace(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetIndexCopyInplaceWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_index_copy_inplace(): + check_error( + LIBINFINIOP.infiniopIndexCopyInplace( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_index_copy_inplace() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_index_copy_inplace( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_index_copy_inplace(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my index_copy_inplace passed!\033[0m") diff --git a/test/infiniop/interpolate_nearest.py b/test/infiniop/interpolate_nearest.py new file mode 100644 index 000000000..335bcd7fd --- /dev/null +++ b/test/infiniop/interpolate_nearest.py @@ -0,0 +1,254 @@ +import torch +import ctypes +from ctypes import c_uint64 + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, output_shape, output_stride) +_TEST_CASES = [ + # 
2D test cases - simplified to one line each + ((1, 1, 2, 2), None, (1, 1, 4, 4), None), # Simple contiguous case + ((1, 3, 4, 4), (48, 16, 4, 1), (1, 3, 8, 8), (192, 64, 8, 1)), # 2D upscaling 2x + ((1, 3, 8, 8), (192, 64, 8, 1), (1, 3, 4, 4), (48, 16, 4, 1)), # 2D downscaling 2x + ((2, 4, 2, 2), (16, 4, 2, 1), (2, 4, 6, 6), (144, 36, 6, 1)), # Batch upscaling + ( + (1, 1, 3, 5), + (15, 15, 5, 1), + (1, 1, 9, 10), + (90, 90, 10, 1), + ), # Different aspect ratio + ( + (4, 64, 16, 16), + (16384, 256, 16, 1), + (4, 64, 32, 32), + (65536, 1024, 32, 1), + ), # Large batch + ((1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 7, 7), (49, 49, 7, 1)), # Small to large + ( + (1, 2, 3, 4), + (24, 1, 8, 2), + (1, 2, 6, 8), + (96, 1, 16, 2), + ), # Non-contiguous layout + ((2, 3, 2, 2), (32, 8, 4, 1), (2, 3, 4, 4), (128, 32, 8, 1)), # Padded strides + # 1D test cases + ((1, 3, 8), (24, 8, 1), (1, 3, 16), (48, 16, 1)), # 1D upscaling 2x + ((2, 5, 10), (50, 10, 1), (2, 5, 5), (25, 5, 1)), # 1D downscaling 2x + ((4, 2, 32), (64, 32, 1), (4, 2, 64), (128, 64, 1)), # 1D larger upscaling + # 3D test cases + ( + (1, 2, 2, 2, 2), + (16, 8, 4, 2, 1), + (1, 2, 4, 4, 4), + (128, 64, 16, 4, 1), + ), # 3D upscaling 2x + ( + (1, 1, 2, 3, 4), + (24, 24, 12, 4, 1), + (1, 1, 4, 6, 8), + (192, 192, 48, 8, 1), + ), # 3D uniform upscaling + ( + (3, 2, 5, 5, 5), + (250, 125, 25, 5, 1), + (3, 2, 3, 3, 3), + (54, 27, 9, 3, 1), + ), # 3D non-uniform scaling +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.I8] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def interpolate_nearest(input_tensor, output_shape, output_tensor): + """ + Perform nearest neighbor interpolation using PyTorch as reference + """ + # Extract spatial dimensions (H, W) + target_size = output_shape[2:] # Skip batch and channel dimensions + + # Use PyTorch's interpolate function with nearest mode + if input_tensor.dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float32,进行插值,再转换回原类型 + original_dtype = input_tensor.dtype + + # 转换为 float32 进行插值 + float_input = input_tensor.float() + result = F.interpolate(float_input, size=target_size, mode="nearest") + + # 转换回原始类型 + result = result.to(original_dtype) + else: + result = F.interpolate(input_tensor, size=target_size, mode="nearest") + + output_tensor.copy_(result) + + +def test( + handle, + device, + input_shape, + input_stride, + output_shape, + output_stride, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input and output tensors + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing InterpolateNearest on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, output_shape: {output_shape}, " + f"input_stride: {input_stride}, output_stride: {output_stride}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + # Compute reference result using PyTorch + interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ) + + if sync is not None: + sync() + + # Create descriptor for our 
interpolate_nearest operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateInterpolateNearestDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetInterpolateNearestWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_interpolate_nearest(): + check_error( + LIBINFINIOP.infiniopInterpolateNearest( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_interpolate_nearest() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Results don't match for shape {input_shape} -> {output_shape}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_interpolate_nearest(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyInterpolateNearestDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/layer_norm.py b/test/infiniop/layer_norm.py new file mode 100644 index 000000000..aacf07186 --- /dev/null +++ b/test/infiniop/layer_norm.py @@ -0,0 +1,265 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, bias_exist, eps, input_strides, output_strides, weight_strides + ((13, 4, 4), True, 1e-5, [30, 4, 1], [50, 4, 1], [2]), + ((16, 5, 563), True, 1e-4, None, None, None), + ((5, 16, 563), False, 1e-5, None, None, [10]), + ((4, 4, 563), True, 1e-5, None, None, None), + ((40, 40, 56), True, 1e-5, [3600, 56, 1], None, None), + ((40, 40, 56), False, 1e-5, [3600, 56, 1], None, None), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# Data types used for testing 
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 5e-2, "rtol": 5e-2}, + InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 5e-2, "rtol": 5e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_layer_norm( + output:torch.Tensor, + input_standardization:torch.Tensor, + input_std_deviation:torch.Tensor, + input:torch.Tensor, + weight, bias, eps, + bias_exist: bool +): + normalized_shape = input.shape[-1:] + ln = torch.nn.LayerNorm( + normalized_shape=normalized_shape, + eps=eps, + dtype=torch.float, + bias=bias_exist, + device=input.device + ) + ln.weight.data = weight.type(torch.float) + if bias_exist: + ln.bias.data = bias.type(torch.float) + input = input.type(torch.float) + mean = input.mean(dim=-1, keepdim=True) + var = input.var(dim=-1, correction=0) + std = torch.sqrt(var + eps) + input_standardization.copy_( + ((input - mean) / std.unsqueeze(2)).type(input_standardization.dtype) + ) + input_std_deviation.copy_(std.type(input_standardization.dtype)) + output.copy_(ln(input).detach().type(output.dtype)) + + + +def test( + handle, + device, + input_shape, + bias_exist, + eps, + input_strides, + output_strides, + weight_strides, + inplace, + dtype, + sync=None, +): + print( + f"Testing layer_norm on {InfiniDeviceNames[device]} with input_shape:{input_shape}," + f"bias:{bias_exist},eps:{eps}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input_standardization = TestTensor( + input_shape, + None, + dtype, + device, + ) + + input_std_deviation = TestTensor( + input_shape[:-1], + None, + dtype, + device, + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + if output_strides != input_strides: + return + output = input + else: + output = TestTensor( + input_shape, + output_strides, + dtype, + device, + ) + + + weight = TestTensor( + input_shape[-1:], + weight_strides, + dtype, + device, + ) + + bias = TestTensor( + input_shape[-1:], + None, + dtype, + device, + ) if bias_exist else None + + torch_layer_norm( + output.torch_tensor(), + input_standardization.torch_tensor(), + input_std_deviation.torch_tensor(), + input.torch_tensor(), + weight.torch_tensor(), + bias.torch_tensor() if bias_exist else None, + eps, + bias_exist + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLayerNormDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input_standardization.descriptor, + input_std_deviation.descriptor, + input.descriptor, + weight.descriptor, + bias.descriptor if bias_exist else None, + eps, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input_standardization, input_std_deviation, input, weight] + [bias] if bias_exist else []: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLayerNormWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_layer_norm(): + check_error( + LIBINFINIOP.infiniopLayerNorm( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input_standardization.data(), + input_std_deviation.data(), + input.data(), + weight.data(), + bias.data() if bias_exist else None, + None, + ) + ) + + 
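+    # Sanity note: lib_layer_norm() is checked against torch_layer_norm above, i.e. y = weight * (x - mean(x, -1)) / sqrt(var(x, -1) + eps) (+ bias when bias_exist), with the standardized input and the per-row standard deviation exposed through the two extra output tensors.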
lib_layer_norm() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + debug(input_standardization.actual_tensor(), input_standardization.torch_tensor(), atol=atol, rtol=rtol) + debug(input_std_deviation.actual_tensor(), input_std_deviation.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor(), '\n') + # print('weight:\n', weight.torch_tensor(), '\n') + # print('bias:\n', bias.torch_tensor(), '\n') + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + # print('input_standardization:\n', input_standardization.torch_tensor(), '\n', input_standardization.actual_tensor(), ) + # print('input_std_deviation:\n', input_std_deviation.torch_tensor(), '\n', input_std_deviation.actual_tensor(), ) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(input_standardization.actual_tensor(), input_standardization.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(input_std_deviation.actual_tensor(), input_std_deviation.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_layer_norm( + output.torch_tensor(), input_standardization.torch_tensor(), input_std_deviation.torch_tensor(), input.torch_tensor(), weight.torch_tensor(), bias.torch_tensor() if bias_exist else None, eps, bias_exist + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_layer_norm(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLayerNormDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my layer_norm passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 869e4aa86..ae20ccf2e 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float +from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -565,7 +565,6 @@ def dequantize_(lib): infiniopOperatorDescriptor_t, ] - @OpRegister.operator def softplus_(lib): lib.infiniopCreateSoftplusDescriptor.restype = c_int32 @@ -586,3 +585,670 @@ def softplus_(lib): ] lib.infiniopDestroySoftplusDescriptor.restype = c_int32 lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def cross_entropy_loss_(lib): + lib.infiniopCreateCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyLossDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # loss_desc + infiniopTensorDescriptor_t, # logits_desc + infiniopTensorDescriptor_t, # target_desc + ] + + lib.infiniopGetCrossEntropyLossWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyLossWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCrossEntropyLoss.restype = c_int32 + lib.infiniopCrossEntropyLoss.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # loss + c_void_p, # logits + c_void_p, # target + c_void_p, # stream + 
] + + lib.infiniopDestroyCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyLossDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def avg_pool_(lib): + lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + + lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAvgPool.restype = c_int32 + lib.infiniopAvgPool.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_pool_(lib): + lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 + lib.infiniopCreateMaxPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, 
# output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMaxPool.restype = c_int32 + lib.infiniopMaxPool.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def interpolate_nearest_(lib): + lib.infiniopCreateInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopCreateInterpolateNearestDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + ] + + lib.infiniopGetInterpolateNearestWorkspaceSize.restype = c_int32 + lib.infiniopGetInterpolateNearestWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopInterpolateNearest.restype = c_int32 + lib.infiniopInterpolateNearest.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopDestroyInterpolateNearestDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardswishDescriptor.restype = c_int32 + lib.infiniopCreateHardswishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardswishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardswishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardswish.restype = c_int32 + lib.infiniopHardswish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHardswishDescriptor.restype = c_int32 + lib.infiniopDestroyHardswishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reduce_max_(lib): + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + 
infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMax.restype = c_int32 + lib.infiniopReduceMax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def div_(lib): + lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def equal_(lib): + lib.infiniopCreateEqualDescriptor.restype = c_int32 + lib.infiniopCreateEqualDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetEqualWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopEqual.restype = c_int32 + lib.infiniopEqual.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyEqualDescriptor.restype = c_int32 + lib.infiniopDestroyEqualDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def batch_norm_(lib): + lib.infiniopCreateBatchNormDescriptor.restype = c_int32 + lib.infiniopCreateBatchNormDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + ] + lib.infiniopGetBatchNormWorkspaceSize.restype = c_int32 + lib.infiniopGetBatchNormWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopBatchNorm.restype = c_int32 + lib.infiniopBatchNorm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyBatchNormDescriptor.restype = c_int32 + lib.infiniopDestroyBatchNormDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def scatter_(lib): + lib.infiniopCreateScatterDescriptor.restype = c_int32 + lib.infiniopCreateScatterDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetScatterWorkspaceSize.restype = c_int32 + lib.infiniopGetScatterWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + 
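+    # The execute binding below follows the same convention as the other operators: (descriptor, workspace, workspace_size, output(s), input(s), stream). Illustrative call with the names used in test/infiniop/scatter.py: LIBINFINIOP.infiniopScatter(descriptor, workspace.data(), workspace.size(), output.data(), input.data(), index.data(), None)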
lib.infiniopScatter.restype = c_int32 + lib.infiniopScatter.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyScatterDescriptor.restype = c_int32 + lib.infiniopDestroyScatterDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def gather_(lib): + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetGatherWorkspaceSize.restype = c_int32 + lib.infiniopGetGatherWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def index_copy_inplace_(lib): + lib.infiniopCreateIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopCreateIndexCopyInplaceDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetIndexCopyInplaceWorkspaceSize.restype = c_int32 + lib.infiniopGetIndexCopyInplaceWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIndexCopyInplace.restype = c_int32 + lib.infiniopIndexCopyInplace.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopDestroyIndexCopyInplaceDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def layer_norm_(lib): + lib.infiniopCreateLayerNormDescriptor.restype = c_int32 + lib.infiniopCreateLayerNormDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + ] + lib.infiniopGetLayerNormWorkspaceSize.restype = c_int32 + lib.infiniopGetLayerNormWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLayerNorm.restype = c_int32 + lib.infiniopLayerNorm.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLayerNormDescriptor.restype = c_int32 + lib.infiniopDestroyLayerNormDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def logical_or_(lib): + lib.infiniopCreateLogicalOrDescriptor.restype = c_int32 + lib.infiniopCreateLogicalOrDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalOrWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalOrWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalOr.restype = c_int32 + lib.infiniopLogicalOr.argtypes = [ + 
infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalOrDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalOrDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def logical_and_(lib): + lib.infiniopCreateLogicalAndDescriptor.restype = c_int32 + lib.infiniopCreateLogicalAndDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalAndWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalAndWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalAnd.restype = c_int32 + lib.infiniopLogicalAnd.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalAndDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalAndDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reduce_mean_(lib): + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 510e3d2fa..cdcfbc515 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,34 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + + is_bool = dt == InfiniDtype.BOOL + if is_bool: + dt = InfiniDtype.F32 + + is_int = ( + dt == InfiniDtype.I8 + or dt == InfiniDtype.I16 + or dt == InfiniDtype.I32 + or dt == InfiniDtype.I64 + ) + + torch_dtype = to_torch_dtype(dt) if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if is_int: + self._torch_tensor = torch.randint( + 0, + 100, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) + else: + self._torch_tensor = torch.rand( + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -90,10 +114,36 @@ def __init__( else: raise ValueError("Unsupported mode") + if is_bool: + self._torch_tensor = self._torch_tensor > 0.5 + if scale is not None: - self._torch_tensor *= scale + if torch_dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float,应用 scale,再转换回原类型 + self._torch_tensor = (self._torch_tensor.float() * scale).to( + torch_dtype + ) + else: + self._torch_tensor *= scale + if bias is not None: - self._torch_tensor += bias + if torch_dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + 
torch.int64, + ]: + # 对于整数类型,先转换为 float,应用 bias,再转换回原类型 + self._torch_tensor = (self._torch_tensor.float() + bias).to(torch_dtype) + else: + self._torch_tensor += bias if strides is not None: self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides) @@ -142,6 +192,8 @@ def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): return torch.float32 elif dt == InfiniDtype.F64: return torch.float64 + elif dt == InfiniDtype.BOOL: + return torch.bool # TODO: These following types may not be supported by older # versions of PyTorch. Use compatability mode to convert them. elif dt == InfiniDtype.U16: diff --git a/test/infiniop/logical_and.py b/test/infiniop/logical_and.py new file mode 100644 index 000000000..b44b83968 --- /dev/null +++ b/test/infiniop/logical_and.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def logical_and(c, a, b): + torch.logical_and(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + 
return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing LogicalAnd on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogicalAndDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogicalAndWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_logical_and(): + check_error( + LIBINFINIOP.infiniopLogicalAnd( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_logical_and() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_logical_and(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLogicalAndDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/logical_or.py b/test/infiniop/logical_or.py new file mode 100644 index 000000000..7bf991052 --- /dev/null +++ b/test/infiniop/logical_or.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each 
test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def logical_or(c, a, b): + torch.logical_or(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing LogicalOr on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogicalOrDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogicalOrWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_logical_or(): + check_error( + LIBINFINIOP.infiniopLogicalOr( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_logical_or() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_logical_or(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLogicalOrDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, 
_TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/maxpool.py b/test/infiniop/maxpool.py new file mode 100644 index 000000000..81ddce060 --- /dev/null +++ b/test/infiniop/maxpool.py @@ -0,0 +1,242 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +# Configuration for profiling +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, kernel_size, stride, padding, ceil_mode) +_TEST_CASES = [ + # 1D max pooling cases + ((1, 3, 8), None, (2,), (2,), (0,), False), + ((2, 4, 16), None, (3,), (2,), (1,), False), + ((3, 2, 77), None, (6,), (4,), (3,), True), + # 2D max pooling cases + ((1, 1, 4, 4), None, (2, 2), (2, 2), (0, 0), False), + ((2, 3, 8, 8), None, (3, 3), (2, 2), (1, 1), False), + ((1, 64, 32, 32), None, (2, 2), (2, 2), (0, 0), False), + ((4, 128, 16, 16), None, (3, 3), (1, 1), (1, 1), False), + # 3D max pooling cases + ((1, 1, 4, 4, 4), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), + ((2, 2, 8, 8, 8), None, (2, 3, 3), (2, 2, 2), (0, 1, 1), False), + # Cases with ceil_mode=True + ((1, 1, 7, 7), None, (3, 3), (2, 2), (1, 1), True), + ((1, 2, 5), None, (3,), (2,), (0,), True), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def max_pool(input_tensor, kernel_size, stride, padding, ceil_mode): + """ + Perform max pooling using PyTorch as reference + """ + ndim = len(input_tensor.shape) - 2 # Spatial dimensions + + if ndim == 1: + result = F.max_pool1d( + input_tensor, + kernel_size=kernel_size[0], + stride=stride[0], + padding=padding[0], + ceil_mode=ceil_mode, + ) + elif ndim == 2: + result = F.max_pool2d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + elif ndim == 3: + result = F.max_pool3d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + return result + + +def tuple_to_void_p(py_tuple: Tuple): + """Convert a python tuple to a ctype void pointer""" + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + + # Compute reference result using PyTorch + torch_ref_output = max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ) + + # Use PyTorch输出shape来初始化output_tensor + output_tensor = TestTensor( + torch_ref_output.shape, None, dt=tensor_dtype, device=device + ) + + print( + f"Testing MaxPool on {InfiniDeviceNames[device]} with " 
+ f"input_shape: {input_shape}, kernel_size: {kernel_size}, " + f"stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + if sync is not None: + sync() + + # Create descriptor for our max pool operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMaxPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_max_pool(): + check_error( + LIBINFINIOP.infiniopMaxPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_max_pool() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ), f"Results don't match for input_shape {input_shape}, kernel_size {kernel_size}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_max_pool(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyMaxPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reduce_max.py b/test/infiniop/reduce_max.py new file mode 100644 index 000000000..3b738ec1e --- /dev/null +++ b/test/infiniop/reduce_max.py @@ -0,0 +1,154 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not maxt to be imported from other modules +_TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + ((), (), None, None, 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 
1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), +] + +# x types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +_TEST_CASES = _TEST_CASES_ + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def reduce_max(x, dim): + return x.max(dim=dim, keepdim=True)[0] + + +def test( + handle, + device, + y_shape, + x_shape, + y_stride, + x_stride, + dim, + dtype=InfiniDtype.F16, + sync=None, +): + print( + f"Testing Reduce_Max on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}" + f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(x_shape, x_stride, dtype, device) + ans = reduce_max(x.torch_tensor(), dim) + + y = TestTensor(y_shape, y_stride, dtype, device) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReduceMaxDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ctypes.c_size_t(dim), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x.destroy_desc() + y.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReduceMaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_reduce_max(): + check_error( + LIBINFINIOP.infiniopReduceMax( + descriptor, + workspace.data(), + workspace_size.value, + y.data(), + x.data(), + None, + ) + ) + + lib_reduce_max() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: reduce_max(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reduce_max(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReduceMaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py new file mode 100644 index 000000000..dfa5ee7a8 --- /dev/null +++ b/test/infiniop/reduce_mean.py @@ -0,0 +1,154 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== 
+# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + ((), (), None, None, 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), +] + +# x types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +_TEST_CASES = _TEST_CASES_ + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def reduce_mean(x, dim): + return x.mean(dim=dim, keepdim=True) + + +def test( + handle, + device, + y_shape, + x_shape, + y_stride, + x_stride, + dim, + dtype=InfiniDtype.F16, + sync=None, +): + print( + f"Testing Reduce_Mean on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}" + f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(x_shape, x_stride, dtype, device) + ans = reduce_mean(x.torch_tensor(), dim) + + y = TestTensor(y_shape, y_stride, dtype, device) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReduceMeanDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ctypes.c_size_t(dim), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x.destroy_desc() + y.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReduceMeanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_reduce_mean(): + check_error( + LIBINFINIOP.infiniopReduceMean( + descriptor, + workspace.data(), + workspace_size.value, + y.data(), + x.data(), + None, + ) + ) + + lib_reduce_mean() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: reduce_mean(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reduce_mean(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReduceMeanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/scatter.py b/test/infiniop/scatter.py new file mode 100644 index 000000000..86ccdcdeb --- /dev/null +++ b/test/infiniop/scatter.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + 
LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + +_TEST_CASES = [ + # input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides + ((6, 7), (6, 7), (6, 7), 1, (7, 1), (1, 7), None), + ((2, 3, 7), (2, 3, 5), (2, 3, 5), 2, (1, 2, 6), None, None), + ((10, 5, 4), (10, 4, 4), (10, 4, 4), 1, None, None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), (11, 2, 2, 4), 0, None, [16, 8, 4, 1], None), +] + + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_scatter(output: torch.Tensor, input, index, dim): + output.scatter_(dim, index, src=input) + + +def test( + handle, + device, + input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing scatter on {InfiniDeviceNames[device]} with input_shape:{input_shape}, index_shape:{index_shape}, output_shape:{output_shape}, dim:{dim}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + + def get_test_index_tensor(input_shape, index_shape, output_shape, scatter_dim): + index = torch.empty(index_shape, dtype=torch.int64) + ndim = len(input_shape) + if ndim == 2 and scatter_dim == 1: + for i in range(input.shape[0]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 2: + for i in range(input.shape[0]): + for j in range(input.shape[1]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, j, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 1: + for i in range(input.shape[0]): + for j in range(input.shape[2]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :, j] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 4 and scatter_dim == 0: + for i in range(input.shape[1]): + for j in range(input.shape[2]): + for k in range(input.shape[3]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[:, i, j, k] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + return index + + torch_index = get_test_index_tensor(input_shape, index_shape, output_shape, dim).type(torch.int64) + if index_strides: + torch_index = torch_index.as_strided(index_shape, index_strides) + index = TestTensor( + index_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_scatter(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateScatterDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being 
directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetScatterWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_scatter(): + check_error( + LIBINFINIOP.infiniopScatter( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_scatter() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_scatter( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_scatter(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyScatterDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my scatter passed!\033[0m") diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..613257e9c --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(output, input): + output.copy_(torch.sin(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + 
dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sin(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_sin(): + check_error( + LIBINFINIOP.infiniopSin( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..dc6ec46e8 --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + 
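+# Each base (shape, stride) case above is paired with every Inplace option in the comprehension below, so the same layout is exercised both out of place and writing the result back into the input tensor.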
+_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(output, input): + output.copy_(torch.tanh(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + tanh(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_tanh(): + check_error( + LIBINFINIOP.infiniopTanh( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..c940d4f05 --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,288 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map, +) +from enum import Enum, auto + +# ====================================================================== +# Configuration 
(Internal Use Only) +# Now each test case tuple is: (shape, a_stride, b_stride, cond_stride, c_stride) +# ====================================================================== +_TEST_CASES_ = [ + ((13, 4), None, None, None, None), + ((13, 4), None, None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None, None), + ((13, 4, 4), None, None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None, None), + ((16, 5632), None, None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_COND = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, + Inplace.INPLACE_COND, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TENSOR_DTYPES = _INTEGER_DTYPES + _FLOAT_DTYPES + +_TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def make_integer_torch_tensor(shape, inf_dt, device): + use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + +def where_ref(c, a, b, cond): + cond_bool = cond.torch_tensor().to(torch.bool) + c.torch_tensor().copy_(torch.where(cond_bool, a.torch_tensor(), b.torch_tensor())) + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + cond_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + inf_dt = dtype + + if not is_supported_dt(inf_dt): + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on this platform") + return + + try: + if _is_integer_dtype(inf_dt): + a_torch = make_integer_torch_tensor(shape, inf_dt, device) + b_torch = 
make_integer_torch_tensor(shape, inf_dt, device) + a = TestTensor.from_torch(a_torch, inf_dt, device) + b = TestTensor.from_torch(b_torch, inf_dt, device) + else: + a = TestTensor(shape, a_stride, inf_dt, device, mode="random") + b = TestTensor(shape, b_stride, inf_dt, device, mode="random") + except RuntimeError as e: + msg = str(e) + if "not implemented for 'UInt32'" in msg or "not implemented for 'UInt64'" in msg or "check_uniform_bounds" in msg: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} because platform torch can't build random tensor: {e}") + return + else: + raise + + dev = torch_device_map[device] + if _is_integer_dtype(inf_dt): + cond_torch = torch.randint(0, 2, size=shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + else: + cond_bool = (torch.rand(shape, device=dev) > 0.5) + cond_torch = cond_bool.to(dtype=to_torch_dtype(inf_dt, compatability_mode=False)) + + cond = TestTensor.from_torch(cond_torch, inf_dt, device) + + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + elif inplace == Inplace.INPLACE_COND: + if c_stride != cond_stride: + return + c = cond + else: + if _is_integer_dtype(inf_dt): + dev = torch_device_map[device] + c_torch = torch.zeros(shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + c = TestTensor.from_torch(c_torch, inf_dt, device) + else: + c = TestTensor(shape, c_stride, inf_dt, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} " + f"shape:{shape} a_stride:{a_stride} b_stride:{b_stride} cond_stride:{cond_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[inf_dt]} inplace:{inplace}" + ) + + where_ref(c, a, b, cond) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + try: + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + cond.descriptor, + ) + ) + except Exception as e: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on {InfiniDeviceNames[device]}: CreateWhereDescriptor failed: {e}") + return + + for tensor in [a, b, c, cond]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_where(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + cond.data(), + None, + ) + ) + + lib_where() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, inf_dt) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: where_ref(c, a, b, cond), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_where(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + +def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + supported = [dt for dt in _TENSOR_DTYPES if is_supported_dt(dt)] + devices = get_test_devices(args) + + for device in devices: + test_operator(device, 
test, _TEST_CASES, supported) + + print("\033[92mTest passed!\033[0m") + + +if __name__ == "__main__": + main() diff --git a/xmake.lua b/xmake.lua index 67add0d45..fbb5156c3 100644 --- a/xmake.lua +++ b/xmake.lua @@ -110,6 +110,16 @@ if has_config("metax-gpu") then includes("xmake/metax.lua") end +option("hcdnn") + set_default(true) + set_showmenu(true) + set_description("Whether to compile hcdnn for Metax GPU") +option_end() + +if has_config("hcdnn") then + add_defines("ENABLE_HCDNN_API") +end + -- 摩尔线程 option("moore-gpu") set_default(false)
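
Usage sketch for the hcdnn switch above (a minimal sketch, assuming the standard xmake configure flow; only the option name comes from the xmake.lua hunk): the option defaults to on, so ENABLE_HCDNN_API is defined unless it is turned off at configure time, e.g.

    xmake f --hcdnn=n    # configure without the hcdnn backend for Metax GPU
    xmake                # then build as usual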