6464#include < aclnnop/aclnn_reflection_pad1d.h>
6565#include < aclnnop/aclnn_eq_tensor.h>
6666#include < aclnnop/aclnn_gt_scalar.h>
67+ #include < aclnnop/aclnn_pow.h>
6768#include < float.h>
6869
6970#include < cmath>
@@ -144,23 +145,6 @@ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
144145 GGML_CANN_CALL_ACLNN_OP (Cast, acl_src, cast_data_type, acl_dst);
145146}
146147
147- /* *
148- * @brief Casts the elements of a tensor to a specified data type using the CANN backend.
149- *
150- * @details This function performs a type conversion on the elements of the input tensor `acl_src`
151- * and stores the results in the destination tensor `acl_dst`. The conversion type is
152- * determined based on the `dst` tensor's data type.
153- *
154- * @param ctx The context for the CANN backend operations.
155- * @param acl_src The source tensor whose elements will be cast.
156- * @param acl_dst The destination tensor that will store the casted elements.
157- * @param dst The ggml tensor specifying the target data type.
158- */
159- static void aclnn_cast (ggml_backend_cann_context& ctx, aclTensor* acl_src,
160- aclTensor* acl_dst, ggml_tensor* dst) {
161- aclnn_cast (ctx, acl_src, acl_dst, ggml_cann_type_mapping (dst->type ));
162- }
163-
164148void ggml_cann_repeat (ggml_backend_cann_context& ctx, ggml_tensor* dst) {
165149 ggml_tensor* src = dst->src [0 ];
166150 GGML_ASSERT (ggml_can_repeat (src, dst));
@@ -767,7 +751,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
767751 if (dst->type == src0->type ) {
768752 cann_copy (ctx, acl_src, acl_dst);
769753 } else {
770- aclnn_cast (ctx, acl_src, acl_dst, dst);
754+ aclnn_cast (ctx, acl_src, acl_dst, ggml_cann_type_mapping ( dst-> type ) );
771755 }
772756 } else {
773757 if (ggml_is_contiguous (src0) && ggml_is_contiguous (dst)) {
@@ -792,7 +776,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
792776 ggml_type_size (dst->type ), src0->ne , src_trans_nb,
793777 GGML_MAX_DIMS);
794778
795- aclnn_cast (ctx, acl_src, src_trans_tensor, dst);
779+ aclnn_cast (ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping ( dst-> type ) );
796780 size_t cpy_size = ggml_nbytes (dst);
797781 ACL_CHECK (aclrtMemcpyAsync (
798782 dst->data , cpy_size, src_trans_buffer, cpy_size,
@@ -814,7 +798,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
814798 ggml_type_size (dst->type ), src0->ne , src_trans_nb,
815799 GGML_MAX_DIMS);
816800
817- aclnn_cast (ctx, acl_src, src_trans_tensor, dst);
801+ aclnn_cast (ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping ( dst-> type ) );
818802
819803 size_t cpy_size = ggml_nbytes (dst);
820804 ACL_CHECK (aclrtMemcpyAsync (dst->data , cpy_size, src_trans_buffer,
@@ -1158,7 +1142,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
11581142 tmp_cast_buffer, ggml_cann_type_mapping (dst->type ),
11591143 ggml_type_size (dst->type ), tmp_im2col_ne, temp_cast_nb,
11601144 GGML_MAX_DIMS - 1 , ACL_FORMAT_ND);
1161- aclnn_cast (ctx, tmp_im2col_tensor, tmp_cast_tensor, dst);
1145+ aclnn_cast (ctx, tmp_im2col_tensor, tmp_cast_tensor, ggml_cann_type_mapping ( dst-> type ) );
11621146 }
11631147
11641148 // post-processing
@@ -1733,7 +1717,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
17331717 aclTensor* src_trans_tensor = ggml_cann_create_tensor (
17341718 src_trans_buffer, ACL_FLOAT, ggml_type_size (dst->type ),
17351719 src0->ne , src_trans_nb, GGML_MAX_DIMS);
1736- aclnn_cast (ctx, acl_src0, src_trans_tensor, dst);
1720+ aclnn_cast (ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping ( dst-> type ) );
17371721 aclnn_embedding_4d (ctx, src_trans_buffer, src0->ne ,
17381722 src_trans_nb, src1, dst);
17391723 ACL_CHECK (aclDestroyTensor (acl_src0));
@@ -2074,7 +2058,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
20742058 output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
20752059 output_cast_nb, GGML_MAX_DIMS);
20762060 aclTensor* acl_dst_tensor = ggml_cann_create_tensor (dst);
2077- aclnn_cast (ctx, acl_output_tensor, acl_dst_tensor, dst);
2061+ aclnn_cast (ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping ( dst-> type ) );
20782062
20792063 ACL_CHECK (aclDestroyTensor (acl_output_tensor));
20802064 ACL_CHECK (aclDestroyTensor (acl_dst_tensor));
@@ -2159,37 +2143,29 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
21592143 ggml_tensor* src1 = dst->src [1 ]; // position
21602144 ggml_tensor* src2 = dst->src [2 ]; // freq_factors
21612145
2162- // arange, [0,1,...,ne0/2]
2163- int64_t arange_length = src0->ne [0 ] / 2 ;
2164- ggml_cann_pool_alloc arange_allocator (ctx.pool (),
2165- arange_length * sizeof (float_t ));
2166- void * arange_buffer = arange_allocator.get ();
2167- int64_t arange_ne[] = {arange_length, 1 , 1 , 1 };
2168- size_t arange_nb[] = {sizeof (float_t ), sizeof (float_t ), sizeof (float_t ),
2169- arange_length * sizeof (float_t )};
2170-
2171- aclTensor* acl_arange_tensor =
2172- ggml_cann_create_tensor (arange_buffer, ACL_FLOAT, sizeof (float_t ),
2173- arange_ne, arange_nb, GGML_MAX_DIMS);
2146+ GGML_TENSOR_BINARY_OP_LOCALS
2147+
2148+ // theta_scale arange, [0,1,...,ne00/2 - 1]
2149+ int64_t theta_scale_length = ne00 / 2 ;
2150+ ggml_cann_pool_alloc theta_scale_allocator (ctx.pool (),
2151+ theta_scale_length * sizeof (float_t ));
2152+ void * theta_scale_buffer = theta_scale_allocator.get ();
2153+ int64_t theta_scale_ne[] = {theta_scale_length, 1 , 1 , 1 };
2154+ size_t theta_scale_nb[] = {sizeof (float_t ), sizeof (float_t ), sizeof (float_t ),
2155+ theta_scale_length * sizeof (float_t )};
2156+
2157+ aclTensor* acl_theta_scale_tensor =
2158+ ggml_cann_create_tensor (theta_scale_buffer, ACL_FLOAT, sizeof (float_t ),
2159+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
21742160 float start = 0 ;
21752161 float step = 1 ;
2176- float stop = src0-> ne [ 0 ] / 2 ;
2177- float n_elements = src0-> ne [ 0 ] / 2 ;
2178- aclnn_arange (ctx, acl_arange_tensor , start, stop, step, n_elements);
2162+ float stop = ne00 / 2 ;
2163+ float n_elements = ne00 / 2 ;
2164+ aclnn_arange (ctx, acl_theta_scale_tensor , start, stop, step, n_elements);
21792165
21802166 // power
2181- // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so
2182- // use aclnn_pow_tensor_tensor() until fixed. aclScalar* acl_theta_scale =
2183- // aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
2184- // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor,
2185- // acl_power_tensor);
2186- ggml_cann_pool_alloc theta_scale_allocator (ctx.pool (),
2187- arange_length * sizeof (float_t ));
2188- void * theta_scale_buffer = theta_scale_allocator.get ();
2189- aclTensor* acl_theta_scale_tensor = aclnn_values (
2190- ctx, theta_scale_buffer, arange_length * sizeof (float_t ), arange_ne,
2191- GGML_MAX_DIMS, ACL_FLOAT, sizeof (float_t ), theta_scale);
2192- aclnn_pow_tensor_tensor (ctx, acl_theta_scale_tensor, acl_arange_tensor);
2167+ aclScalar* acl_theta_scale = aclCreateScalar (&theta_scale, aclDataType::ACL_FLOAT);
2168+ GGML_CANN_CALL_ACLNN_OP (PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, acl_theta_scale_tensor);
21932169
21942170 // freq_scale
21952171 if (freq_scale != 1 ) {
@@ -2200,28 +2176,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
22002176 if (src2) {
22012177 aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor (
22022178 src2->data , ggml_cann_type_mapping (src2->type ),
2203- ggml_type_size (src2->type ), arange_ne, arange_nb , GGML_MAX_DIMS);
2179+ ggml_type_size (src2->type ), theta_scale_ne, theta_scale_nb , GGML_MAX_DIMS);
22042180 aclnn_div (ctx, acl_theta_scale_tensor, acl_freq_factors_tensor);
22052181 ACL_CHECK (aclDestroyTensor (acl_freq_factors_tensor));
22062182 }
22072183
22082184 // position
22092185 GGML_ASSERT (src1->type == GGML_TYPE_I32);
22102186 int64_t position_length = src1->ne [0 ];
2211- int64_t position_ne[] = {1 , position_length, 1 , 1 };
2212- size_t position_nb[] = {sizeof (int32_t ), sizeof (int32_t ),
2213- sizeof (int32_t ) * position_length,
2187+ int64_t position_ne[] = {1 , 1 , position_length, 1 };
2188+ size_t position_nb[] = {sizeof (int32_t ), sizeof (int32_t ), sizeof (int32_t ),
22142189 sizeof (int32_t ) * position_length};
22152190 aclTensor* acl_position_tensor = ggml_cann_create_tensor (
22162191 src1->data , ggml_cann_type_mapping (src1->type ),
22172192 ggml_type_size (src1->type ), position_ne, position_nb, GGML_MAX_DIMS);
22182193
22192194 // power * position
2220- int64_t theta_length = arange_length * position_length;
2195+ int64_t theta_length = theta_scale_length * position_length;
22212196 ggml_cann_pool_alloc theta_allocator (ctx.pool (),
22222197 theta_length * sizeof (float_t ));
22232198 void * theta_buffer = theta_allocator.get ();
2224- int64_t theta_ne[] = {arange_length, position_length, 1 , 1 };
2199+ int64_t theta_ne[] = {theta_scale_length, 1 , position_length , 1 };
22252200 size_t theta_nb[GGML_MAX_DIMS];
22262201 theta_nb[0 ] = sizeof (float_t );
22272202 for (int i = 1 ; i < GGML_MAX_DIMS; i++) {
@@ -2233,40 +2208,22 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
22332208 aclnn_mul (ctx, acl_position_tensor, acl_theta_scale_tensor,
22342209 acl_theta_tensor);
22352210
2236- // permute: [0,1,2,3]->[0,2,1,3]
2237- int64_t permute_ne[] = {arange_length, 1 , position_length, 1 };
2238- size_t permute_nb[GGML_MAX_DIMS];
2239- permute_nb[0 ] = sizeof (float_t );
2240- for (int i = 1 ; i < GGML_MAX_DIMS; i++) {
2241- permute_nb[i] = permute_nb[i - 1 ] * permute_ne[i - 1 ];
2242- }
2243- ggml_cann_pool_alloc permute_allocator (ctx.pool (),
2244- theta_length * sizeof (float_t ));
2245- void * permute_buffer = permute_allocator.get ();
2246- aclTensor* acl_permute_tensor = ggml_cann_create_tensor (
2247- permute_buffer, ACL_FLOAT, sizeof (float_t ), permute_ne, permute_nb,
2248- GGML_MAX_DIMS, ACL_FORMAT_ND);
2249- int64_t permute_dim[] = {0 , 2 , 1 , 3 };
2250- int64_t num_dims = 4 ;
2251- aclnn_permute (ctx, acl_theta_tensor, acl_permute_tensor, permute_dim,
2252- num_dims);
2253-
22542211 // sin/cos
22552212 ggml_cann_pool_alloc sin_allocator (ctx.pool (),
22562213 theta_length * sizeof (float_t ));
22572214 void * sin_buffer = sin_allocator.get ();
22582215 aclTensor* acl_sin_tensor = ggml_cann_create_tensor (
2259- sin_buffer, ACL_FLOAT, sizeof (float_t ), permute_ne, permute_nb ,
2216+ sin_buffer, ACL_FLOAT, sizeof (float_t ), theta_ne, theta_nb ,
22602217 GGML_MAX_DIMS, ACL_FORMAT_ND);
2261- aclnn_sin (ctx, acl_permute_tensor , acl_sin_tensor);
2218+ aclnn_sin (ctx, acl_theta_tensor , acl_sin_tensor);
22622219
22632220 ggml_cann_pool_alloc cos_allocator (ctx.pool (),
22642221 theta_length * sizeof (float_t ));
22652222 void * cos_buffer = cos_allocator.get ();
22662223 aclTensor* acl_cos_tensor = ggml_cann_create_tensor (
2267- cos_buffer, ACL_FLOAT, sizeof (float_t ), permute_ne, permute_nb ,
2224+ cos_buffer, ACL_FLOAT, sizeof (float_t ), theta_ne, theta_nb ,
22682225 GGML_MAX_DIMS, ACL_FORMAT_ND);
2269- aclnn_cos (ctx, acl_permute_tensor , acl_cos_tensor);
2226+ aclnn_cos (ctx, acl_theta_tensor , acl_cos_tensor);
22702227
22712228 // attn_factor
22722229 if (attn_factor != 1 ) {
@@ -2282,21 +2239,20 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
22822239 } else {
22832240 int64_t num_repeats = 2 ;
22842241 int64_t dim = 3 ;
2285- int64_t output_size = arange_length * num_repeats;
2242+ int64_t output_size = theta_scale_length * num_repeats;
22862243 aclnn_repeat_interleave (ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
22872244 num_repeats, output_size);
22882245 aclnn_repeat_interleave (ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
22892246 num_repeats, output_size);
22902247 }
22912248
22922249 // release
2293- ACL_CHECK (aclDestroyTensor (acl_arange_tensor));
22942250 ACL_CHECK (aclDestroyTensor (acl_theta_scale_tensor));
22952251 ACL_CHECK (aclDestroyTensor (acl_position_tensor));
22962252 ACL_CHECK (aclDestroyTensor (acl_theta_tensor));
2297- ACL_CHECK (aclDestroyTensor (acl_permute_tensor));
22982253 ACL_CHECK (aclDestroyTensor (acl_sin_tensor));
22992254 ACL_CHECK (aclDestroyTensor (acl_cos_tensor));
2255+ ACL_CHECK (aclDestroyScalar (acl_theta_scale));
23002256}
23012257
23022258#ifdef __cplusplus
@@ -2318,7 +2274,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
23182274 // TODO: use ascendc
23192275 // Only test with LLAMA model.
23202276 ggml_tensor* src0 = dst->src [0 ]; // input
2321- // ggml_tensor* src2 = dst->src[2]; // freq_factors, not used now.
23222277
23232278 // param
23242279 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -2353,13 +2308,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
23532308
23542309 // init cos/sin cache
23552310 ggml_cann_pool_alloc sin_allocator (
2356- ctx.pool (), src0-> ne [ 0 ] * src0-> ne [ 2 ] * sizeof (float_t ));
2311+ ctx.pool (), ne00 * ne02 * sizeof (float_t ));
23572312 ggml_cann_pool_alloc cos_allocator (
2358- ctx.pool (), src0-> ne [ 0 ] * src0-> ne [ 2 ] * sizeof (float_t ));
2313+ ctx.pool (), ne00 * ne02 * sizeof (float_t ));
23592314 void * sin_buffer = sin_allocator.get ();
23602315 void * cos_buffer = cos_allocator.get ();
23612316
2362- int64_t sin_reshape_ne[4 ] = {src0-> ne [ 0 ] , 1 , src0-> ne [ 2 ] , 1 };
2317+ int64_t sin_reshape_ne[4 ] = {ne00 , 1 , ne02 , 1 };
23632318 size_t sin_reshape_nb[GGML_MAX_DIMS];
23642319 sin_reshape_nb[0 ] = sizeof (float_t );
23652320 for (int i = 1 ; i < GGML_MAX_DIMS; i++) {
@@ -2372,7 +2327,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
23722327 ggml_cann_create_tensor (cos_buffer, ACL_FLOAT, sizeof (float_t ),
23732328 sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
23742329 aclnn_cache_init (ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
2375- theta_scale, freq_scale, attn_factor, is_neox);
2330+ theta_scale, freq_scale, attn_factor, is_neox);
23762331
23772332 aclTensor* acl_src = ggml_cann_create_tensor (src0);
23782333 aclTensor* acl_dst = ggml_cann_create_tensor (dst);
@@ -2549,46 +2504,51 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
25492504 return ;
25502505#endif
25512506
2552- // src0 == GGML_TYPE_F16
2553- // TODO: optimization this `if` code
2554- if (src0->type == GGML_TYPE_F16) {
2555- ggml_cann_pool_alloc sin_final_allocator (
2556- ctx.pool (), src0->ne [0 ] * src0->ne [2 ] * ggml_type_size (src0->type ));
2557- ggml_cann_pool_alloc cos_final_allocator (
2558- ctx.pool (), src0->ne [0 ] * src0->ne [2 ] * ggml_type_size (src0->type ));
2559- void * sin_final_buffer = sin_final_allocator.get ();
2560- void * cos_final_buffer = cos_final_allocator.get ();
2561-
2562- int64_t sin_final_ne[4 ] = {src0->ne [0 ], 1 , src0->ne [2 ], 1 };
2563- size_t sin_final_nb[GGML_MAX_DIMS];
2564- sin_final_nb[0 ] = ggml_type_size (src0->type );
2565- for (int i = 1 ; i < GGML_MAX_DIMS; i++) {
2566- sin_final_nb[i] = sin_final_nb[i - 1 ] * sin_final_ne[i - 1 ];
2507+ // ggml_mode = 0 --> aclnn_mode = 1
2508+ int64_t acl_mode = mode == 0 ? 1 : mode;
2509+
2510+ switch (src0->type ) {
2511+ case GGML_TYPE_F32: {
2512+ GGML_CANN_CALL_ACLNN_OP (RotaryPositionEmbedding, acl_src, acl_cos_reshape_tensor,
2513+ acl_sin_reshape_tensor, acl_mode, acl_dst);
2514+ break ;
25672515 }
2568- aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor (
2569- sin_final_buffer, ggml_cann_type_mapping (src0->type ),
2570- ggml_type_size (src0->type ), sin_final_ne, sin_final_nb,
2571- GGML_MAX_DIMS);
2572- aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor (
2573- cos_final_buffer, ggml_cann_type_mapping (src0->type ),
2574- ggml_type_size (src0->type ), sin_final_ne, sin_final_nb,
2575- GGML_MAX_DIMS);
2516+ case GGML_TYPE_F16: {
2517+ ggml_cann_pool_alloc src_trans_allocator (
2518+ ctx.pool (), ggml_nelements (src0) * sizeof (float ));
2519+ void * src_trans_buffer = src_trans_allocator.get ();
2520+ ggml_cann_pool_alloc dst_trans_allocator (
2521+ ctx.pool (), ggml_nelements (dst) * sizeof (float ));
2522+ void * dst_trans_buffer = dst_trans_allocator.get ();
25762523
2577- aclnn_cast (ctx, acl_sin_reshape_tensor, acl_sin_final_tensor, dst);
2578- aclnn_cast (ctx, acl_cos_reshape_tensor, acl_cos_final_tensor, dst);
2579- ACL_CHECK (aclDestroyTensor (acl_cos_reshape_tensor));
2580- ACL_CHECK (aclDestroyTensor (acl_sin_reshape_tensor));
2581- acl_sin_reshape_tensor = acl_sin_final_tensor;
2582- acl_cos_reshape_tensor = acl_cos_final_tensor;
2583- }
2524+ size_t src_trans_nb[GGML_MAX_DIMS];
2525+ src_trans_nb[0 ] = sizeof (float );
2526+ for (int i = 1 ; i < GGML_MAX_DIMS; i++) {
2527+ src_trans_nb[i] = src_trans_nb[i - 1 ] * src0->ne [i - 1 ];
2528+ }
25842529
2585- int acl_mode = mode;
2586- if (mode == 0 ) {
2587- acl_mode = 1 ;
2588- }
2530+ aclTensor* acl_src_trans_tensor = ggml_cann_create_tensor (
2531+ src_trans_buffer, ACL_FLOAT, sizeof (float ), src0->ne , src_trans_nb,
2532+ GGML_MAX_DIMS);
2533+ aclTensor* acl_dst_trans_tensor = ggml_cann_create_tensor (
2534+ dst_trans_buffer, ACL_FLOAT, sizeof (float ), dst->ne , src_trans_nb,
2535+ GGML_MAX_DIMS);
2536+
2537+ aclnn_cast (ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);
2538+
2539+ GGML_CANN_CALL_ACLNN_OP (RotaryPositionEmbedding, acl_src_trans_tensor, acl_cos_reshape_tensor,
2540+ acl_sin_reshape_tensor, acl_mode, acl_dst_trans_tensor);
2541+
2542+ aclnn_cast (ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16);
25892543
2590- GGML_CANN_CALL_ACLNN_OP (RotaryPositionEmbedding, acl_src, acl_cos_reshape_tensor,
2591- acl_sin_reshape_tensor, acl_mode, acl_dst);
2544+ ACL_CHECK (aclDestroyTensor (acl_src_trans_tensor));
2545+ ACL_CHECK (aclDestroyTensor (acl_dst_trans_tensor));
2546+ break ;
2547+ }
2548+ default :
2549+ GGML_ABORT (" Unsupported tensor type for GGML_OP_ROPE" );
2550+ break ;
2551+ }
25922552 ACL_CHECK (aclDestroyTensor (acl_src));
25932553 ACL_CHECK (aclDestroyTensor (acl_cos_reshape_tensor));
25942554 ACL_CHECK (aclDestroyTensor (acl_sin_reshape_tensor));
0 commit comments