  */
 
 #include "aclnn_ops.h"
+#include "ggml-impl.h"
 
 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -241,10 +242,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);

-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
     aclTensor* tensors[] = {acl_src0, acl_src1};
     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);

     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_dst));
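Note on the mapping above: ggml orders tensor dimensions with ne[0] as the innermost (fastest-varying) axis, while the 4-D ACL tensors built by ggml_cann_create_tensor are indexed from the outermost axis, so a ggml dim d corresponds to ACL dim 3 - d. A minimal sketch of that mapping as a standalone helper (the helper name is illustrative, not part of this change):

    // Illustrative helper (hypothetical name): mirror a ggml axis index into
    // the ACL axis index passed to aclnn_concat above. ggml's ne[0] is the
    // innermost axis, the ACL view is outermost-first, so the index is
    // reflected across the four dims.
    static inline int32_t ggml_axis_to_acl_axis(int32_t dim) {
        GGML_ASSERT(dim >= 0 && dim < 4);
        return 3 - dim;  // ggml dim 0 (innermost) -> ACL dim 3, and so on
    }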
@@ -1437,10 +1442,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input

-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
     GGML_TENSOR_BINARY_OP_LOCALS;

     // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1463,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;

-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
     // memory allocated increased to 3x when is_2D == false
     const int64_t n_bytes_factor = is_2D ? 1 : 3;

@@ -2859,15 +2857,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }

+#ifdef __cplusplus
+extern "C" {
+#endif
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+#ifdef __cplusplus
+}
+#endif
+
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: use ascendc
     // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src2 = dst->src[2];  // freq_factors

-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -2885,13 +2895,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t *)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *)dst->op_params + 10, sizeof(float));

-    GGML_ASSERT(n_dims <= ne0);
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
     // TODO: freq_scale != 1
     GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

     const float theta_scale = powf(freq_base, -2.0f / n_dims);

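For reference, theta_scale is the per-pair ratio of the standard RoPE angle schedule whose cos/sin values aclnn_cache_init fills in below: pair i at position p is rotated by theta_i(p) = p * freq_base^(-2i/n_dims) = p * theta_scale^i. A short comment-only sketch (not part of the change) of how the cached angles relate:

    // Standard RoPE angles cached as cos/sin by aclnn_cache_init below:
    //   theta_i(p) = p * powf(freq_base, -2.0f * i / n_dims)
    //              = p * powf(theta_scale, i)     // theta_scale from above
    // Each rotated pair (consecutive elements in normal mode, split halves
    // in NeoX mode) shares the angle theta_i(p).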
@@ -2924,177 +2940,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                      theta_scale, is_neox);

-    // roll input
-    void* input_roll_buffer;
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-
-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;

-        // init [-1, -1, -1, 1, 1,1,...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
+    void* workspaceAddr = nullptr;

-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
-    }
-
-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
     }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);

-    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    void* output_fp32_buffer;
-    if (src0->type == GGML_TYPE_F32) {
-        aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
-        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
-                          acl_sin_reshape_tensor);
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
     }

-    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }
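The prototypes declared above ggml_cann_rope follow the standard two-phase aclnn calling convention that the new body exercises: query the workspace size, allocate scratch memory from the backend pool if any is required, then launch on the context's stream. A minimal generic sketch of that convention, with a hypothetical operator name aclnnFoo standing in for any aclnn op (acl_in/acl_out are placeholder tensors; the rest mirrors the code above):

    // Hypothetical operator "aclnnFoo"; only the call pattern itself is real.
    uint64_t ws_size = 0;
    aclOpExecutor* executor = nullptr;
    ACL_CHECK(aclnnFooGetWorkspaceSize(acl_in, acl_out, &ws_size, &executor));

    void* ws_addr = nullptr;
    if (ws_size > 0) {
        // scratch space comes from the backend memory pool, as above
        ggml_cann_pool_alloc ws_alloc(ctx.pool(), ws_size);
        ws_addr = ws_alloc.get();
    }
    ACL_CHECK(aclnnFoo(ws_addr, ws_size, executor, ctx.stream()));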