 #include <aclnnop/aclnn_reflection_pad1d.h>
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
+#include <aclnnop/aclnn_pow.h>
 #include <float.h>

 #include <cmath>
@@ -2159,69 +2160,60 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ggml_tensor* src1 = dst->src[1];  // position
     ggml_tensor* src2 = dst->src[2];  // freq_factors

-    // arange, [0,1,...,ne0/2]
-    int64_t arange_length = src0->ne[0] / 2;
-    ggml_cann_pool_alloc arange_allocator(ctx.pool(),
-                                          arange_length * sizeof(float_t));
-    void* arange_buffer = arange_allocator.get();
-    int64_t arange_ne[] = {arange_length, 1, 1, 1};
-    size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
-                          arange_length * sizeof(float_t)};
-
-    aclTensor* acl_arange_tensor =
-        ggml_cann_create_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t),
-                                arange_ne, arange_nb, GGML_MAX_DIMS);
-    float start = 0;
-    float step = 1;
-    float stop = src0->ne[0] / 2;
-    float n_elements = src0->ne[0] / 2;
-    aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements);
+    GGML_TENSOR_BINARY_OP_LOCALS

-    // power
-    // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so
-    // use aclnn_pow_tensor_tensor() until fixed. aclScalar* acl_theta_scale =
-    // aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
-    // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor,
-    // acl_power_tensor);
+    // theta_scale arange, [0,1,...,ne0/2]
+    int64_t theta_scale_length = ne00 / 2;
     ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
-                                               arange_length * sizeof(float_t));
+                                               theta_scale_length * sizeof(float_t));
     void* theta_scale_buffer = theta_scale_allocator.get();
-    aclTensor* acl_theta_scale_tensor = aclnn_values(
-        ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
-        GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
-    aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
+    int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
+    size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
+                               theta_scale_length * sizeof(float_t)};
+
+    aclTensor* acl_theat_scale_tensor =
+        ggml_cann_create_tensor(theta_scale_buffer, ACL_FLOAT, sizeof(float_t),
+                                theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+    float start = 0;
+    float step = 1;
+    float stop = ne00 / 2;
+    float n_elements = ne00 / 2;
+    aclnn_arange(ctx, acl_theat_scale_tensor, start, stop, step, n_elements);

+    // power
+    aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
+    GGML_CANN_CALL_ACLNN_OP(PowScalarTensor, acl_theta_scale, acl_theat_scale_tensor, acl_theat_scale_tensor);
+
     // freq_scale
     if (freq_scale != 1) {
-        aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
+        aclnn_muls(ctx, acl_theat_scale_tensor, freq_scale, nullptr, true);
     }

     // freq_factors
     if (src2) {
         aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
             src2->data, ggml_cann_type_mapping(src2->type),
-            ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
-        aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor);
+            ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+        aclnn_div(ctx, acl_theat_scale_tensor, acl_freq_factors_tensor);
         ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
     }

     // position
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
     int64_t position_length = src1->ne[0];
-    int64_t position_ne[] = {1, position_length, 1, 1};
-    size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t),
-                            sizeof(int32_t) * position_length,
+    int64_t position_ne[] = {1, 1, position_length, 1};
+    size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t),
                             sizeof(int32_t) * position_length};
     aclTensor* acl_position_tensor = ggml_cann_create_tensor(
         src1->data, ggml_cann_type_mapping(src1->type),
         ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);

     // power * position
-    int64_t theta_length = arange_length * position_length;
+    int64_t theta_length = theta_scale_length * position_length;
     ggml_cann_pool_alloc theta_allocator(ctx.pool(),
                                          theta_length * sizeof(float_t));
     void* theta_buffer = theta_allocator.get();
-    int64_t theta_ne[] = {arange_length, position_length, 1, 1};
+    int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
     size_t theta_nb[GGML_MAX_DIMS];
     theta_nb[0] = sizeof(float_t);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
@@ -2230,43 +2222,25 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     aclTensor* acl_theta_tensor =
         ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
                                 theta_ne, theta_nb, GGML_MAX_DIMS);
-    aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
+    aclnn_mul(ctx, acl_position_tensor, acl_theat_scale_tensor,
               acl_theta_tensor);

-    // permute: [0,1,2,3]->[0,2,1,3]
-    int64_t permute_ne[] = {arange_length, 1, position_length, 1};
-    size_t permute_nb[GGML_MAX_DIMS];
-    permute_nb[0] = sizeof(float_t);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1];
-    }
-    ggml_cann_pool_alloc permute_allocator(ctx.pool(),
-                                           theta_length * sizeof(float_t));
-    void* permute_buffer = permute_allocator.get();
-    aclTensor* acl_permute_tensor = ggml_cann_create_tensor(
-        permute_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
-        GGML_MAX_DIMS, ACL_FORMAT_ND);
-    int64_t permute_dim[] = {0, 2, 1, 3};
-    int64_t num_dims = 4;
-    aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim,
-                  num_dims);
-
     // sin/cos
     ggml_cann_pool_alloc sin_allocator(ctx.pool(),
                                        theta_length * sizeof(float_t));
     void* sin_buffer = sin_allocator.get();
     aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
-        sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
+        sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
         GGML_MAX_DIMS, ACL_FORMAT_ND);
-    aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor);
+    aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);

     ggml_cann_pool_alloc cos_allocator(ctx.pool(),
                                        theta_length * sizeof(float_t));
     void* cos_buffer = cos_allocator.get();
     aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
-        cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb,
+        cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
         GGML_MAX_DIMS, ACL_FORMAT_ND);
-    aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
+    aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);

     // attn_factor
     if (attn_factor != 1) {
@@ -2282,19 +2256,17 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     } else {
         int64_t num_repeats = 2;
         int64_t dim = 3;
-        int64_t output_size = arange_length * num_repeats;
+        int64_t output_size = theta_scale_length * num_repeats;
         aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
                                 num_repeats, output_size);
         aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
                                 num_repeats, output_size);
     }

     // release
-    ACL_CHECK(aclDestroyTensor(acl_arange_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor));
+    ACL_CHECK(aclDestroyTensor(acl_theat_scale_tensor));
     ACL_CHECK(aclDestroyTensor(acl_position_tensor));
     ACL_CHECK(aclDestroyTensor(acl_theta_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_permute_tensor));
     ACL_CHECK(aclDestroyTensor(acl_sin_tensor));
     ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }
@@ -2353,13 +2325,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

     // init cos/sin cache
     ggml_cann_pool_alloc sin_allocator(
-        ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
+        ctx.pool(), ne00 * ne02 * sizeof(float_t));
     ggml_cann_pool_alloc cos_allocator(
-        ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t));
+        ctx.pool(), ne00 * ne02 * sizeof(float_t));
     void* sin_buffer = sin_allocator.get();
     void* cos_buffer = cos_allocator.get();

-    int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
+    int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
     size_t sin_reshape_nb[GGML_MAX_DIMS];
     sin_reshape_nb[0] = sizeof(float_t);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
0 commit comments