1+ #include "../../../devices/moore/moore_common.h"
2+ #include "softmax_moore.h"
3+
4+ #include < cub / block / block_reduce . cuh >
5+ #include "../../../devices/moore/moore_kernel_common.h"
6+
7+ #include "../../../reduce/cuda/reduce.cuh"
8+
9+ #include "softmax_moore_kernel.h"
10+
/**
 * Launchable entry point for the softmax computation.
 *
 * This wrapper only forwards its arguments to the `softmaxKernel` routine
 * (declared outside this file), instantiated with the same template
 * parameters.
 *
 * @tparam BLOCK_SIZE  Threads per block; the host launcher in this file uses
 *                     the same value as the launch block dimension.
 * @tparam Tdata       Element type of the input and output buffers.
 * @tparam Tcompute    Type forwarded to `softmaxKernel` as its compute type.
 *
 * @param y          Output buffer.
 * @param x          Input buffer.
 * @param othersize  Forwarded unchanged; the host launcher starts one block
 *                   per unit of this value.
 * @param dimsize    Forwarded unchanged (presumably the length of the softmax
 *                   axis -- confirm against `softmaxKernel`).
 * @param stride     Forwarded unchanged (presumably the element stride along
 *                   the softmax axis -- confirm against `softmaxKernel`).
 */
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL softmax_kernel(
    Tdata *y,
    const Tdata *x,
    size_t othersize,
    size_t dimsize,
    ptrdiff_t stride) {
    softmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, othersize, dimsize, stride);
}
17+
18+ namespace op ::softmax ::moore {
19+
/// Backend-private descriptor state.
struct Descriptor::Opaque {
    /// Shared reference to the Moore handle's internal object; queried at
    /// calculate() time for the device's maximum threads per block.
    std::shared_ptr<device::moore::Handle::Internal> internal;
};
23+
/// Releases the backend-private state that create() allocated with `new`.
Descriptor::~Descriptor() {
    delete _opaque;
}
27+
/**
 * Builds a softmax descriptor for the Moore backend.
 *
 * @param handle    Operator handle; reinterpreted as a device::moore::Handle
 *                  to obtain its shared internal object.
 * @param desc_ptr  Receives the newly allocated descriptor on success.
 * @param y_desc    Output tensor descriptor.
 * @param x_desc    Input tensor descriptor.
 * @param axis      Axis argument passed through to SoftmaxInfo::create.
 * @return INFINI_STATUS_SUCCESS on success; otherwise whatever CHECK_RESULT
 *         returns when SoftmaxInfo::create fails.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    int axis) {
    // Validate the tensor descriptors and derive the launch parameters.
    auto result = SoftmaxInfo::create(y_desc, x_desc, axis);
    CHECK_RESULT(result);

    // Keep the handle's internal object alive for the descriptor's lifetime.
    auto *moore_handle = reinterpret_cast<device::moore::Handle *>(handle);
    auto *opaque = new Opaque{moore_handle->internal()};

    // The third constructor argument is 0, as in the original call
    // (presumably the workspace size -- confirm against the Descriptor class).
    *desc_ptr = new Descriptor(
        opaque,
        result.take(),
        0,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
41+
/**
 * Selects the softmax kernel instantiation for `dtype` and launches it on
 * `stream` with one block per unit of `othersize` and BLOCK_SIZE threads per
 * block.
 *
 * F16 and BF16 inputs are instantiated with a `float` compute type; F32 uses
 * `float` for both.
 *
 * @tparam BLOCK_SIZE  Threads per block, also passed as the kernel's template
 *                     argument.
 * @param y          Output buffer, cast to the element type for `dtype`.
 * @param x          Input buffer, cast to the element type for `dtype`.
 * @param dtype      Element type selector (F16, BF16 or F32).
 * @param othersize  Number of blocks to launch; forwarded to the kernel.
 * @param dimsize    Forwarded to the kernel.
 * @param stride     Forwarded to the kernel.
 * @param stream     Stream the kernel is enqueued on.
 * @return INFINI_STATUS_BAD_TENSOR_DTYPE for any other dtype,
 *         INFINI_STATUS_SUCCESS otherwise (including empty tensors, for which
 *         nothing is launched).
 */
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
                            size_t othersize, size_t dimsize, ptrdiff_t stride,
                            musaStream_t stream) {
    // Reject unsupported element types first so the error is reported
    // regardless of the tensor's size.
    if (dtype != INFINI_DTYPE_F16
        && dtype != INFINI_DTYPE_BF16
        && dtype != INFINI_DTYPE_F32) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    // An empty tensor has no output elements. Launching with a zero-sized
    // grid is an invalid launch configuration and would leave a pending
    // runtime error behind, so skip the launch entirely.
    if (othersize == 0 || dimsize == 0) {
        return INFINI_STATUS_SUCCESS;
    }

    // NOTE(review): othersize is narrowed to uint32_t here; values beyond the
    // device's grid x-dimension limit are not handled.
    dim3 grid(uint32_t(othersize), 1, 1);

    if (dtype == INFINI_DTYPE_F16) {
        softmax_kernel<BLOCK_SIZE, half, float>
            <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x,
                                              othersize, dimsize, stride);
    } else if (dtype == INFINI_DTYPE_BF16) {
        softmax_kernel<BLOCK_SIZE, __mt_bfloat16, float>
            <<<grid, BLOCK_SIZE, 0, stream>>>((__mt_bfloat16 *)y, (const __mt_bfloat16 *)x,
                                              othersize, dimsize, stride);
    } else {
        // dtype == INFINI_DTYPE_F32 (guaranteed by the validation above).
        softmax_kernel<BLOCK_SIZE, float, float>
            <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x,
                                              othersize, dimsize, stride);
    }
    return INFINI_STATUS_SUCCESS;
}
64+
/**
 * Runs softmax on `stream_`, choosing the kernel block size from the
 * device's reported maximum threads per block.
 *
 * @param workspace       Not used by this implementation.
 * @param workspace_size  Not used by this implementation.
 * @param y               Output buffer.
 * @param x               Input buffer.
 * @param stream_         Opaque stream pointer, cast to musaStream_t.
 * @return INFINI_STATUS_SUCCESS on success;
 *         INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED when the device's
 *         limit is neither MOORE_BLOCK_SIZE_1024 nor MOORE_BLOCK_SIZE_512;
 *         otherwise the status propagated from launchKernel by CHECK_STATUS.
 */
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *y,
                                     const void *x,
                                     void *stream_) const {
    musaStream_t stream = (musaStream_t)stream_;
    const auto max_threads = _opaque->internal->maxThreadsPerBlock();

    // Only the two block sizes below have kernel instantiations here.
    if (max_threads == MOORE_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(
            y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
        return INFINI_STATUS_SUCCESS;
    }
    if (max_threads == MOORE_BLOCK_SIZE_512) {
        CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(
            y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
        return INFINI_STATUS_SUCCESS;
    }
    return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
81+
82+ } // namespace op::softmax::moore
0 commit comments