 #include <aclnnop/aclnn_zero.h>
 #include <aclnnop/aclnn_index_copy.h>
 #include <aclnnop/aclnn_index_select.h>
+#include <aclnnop/aclnn_clamp.h>
+#include <aclnnop/aclnn_threshold.h>
 #include <float.h>
 
 #include <cmath>
@@ -1423,21 +1425,25 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
  * @param start Starting exponent offset.
  * @param stop Stopping exponent offset (exclusive).
  * @param step Step size for the exponent increment.
+ * @param dtype Data type of the slope tensor.
  */
 static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
-    float m, int64_t size, float start, float stop, float step){
+    float m, int64_t size, float start, float stop, float step, ggml_type dtype){
+    aclDataType acl_type = ggml_cann_type_mapping(dtype);
+    size_t type_size = ggml_type_size(dtype);
+
     int64_t ne[] = {size};
-    size_t nb[] = {sizeof(uint16_t)};
+    size_t nb[] = {type_size};
 
-    ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(uint16_t));
+    ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
     void* arange_buffer = arange_allocator.get();
 
     aclTensor* arange_tensor = ggml_cann_create_tensor(
-        arange_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
+        arange_buffer, acl_type, type_size, ne, nb, 1);
     aclnn_arange(ctx, arange_tensor, start, stop, step, size);
 
     aclTensor* slope_tensor = ggml_cann_create_tensor(
-        slope_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
+        slope_buffer, acl_type, type_size, ne, nb, 1);
 
     aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
 
@@ -1468,10 +1474,11 @@ static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_bu
  * @param n_head Total number of attention heads.
  * @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
  * @param max_bias Maximum bias value for slope computation.
+ * @param dtype Data type of the slope tensor.
  *
  */
 static void aclnn_get_slope(ggml_backend_cann_context& ctx, int64_t n_head,
-    void* slope_buffer, float max_bias) {
+    void* slope_buffer, float max_bias, ggml_type dtype) {
     const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
 
     float m0 = powf(2.0f, -(max_bias) / n_head_log2);
@@ -1488,7 +1495,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
     float step = 1;
     float count = n_head_log2;
     // end needs to be +1 because aclnn uses a left-closed, right-open interval.
-    aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
+    aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
     if (n_head_log2 < n_head) {
         // arange2
         start = 2 * (n_head_log2 - n_head_log2) + 1;
@@ -1497,7 +1504,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
         count = n_head - n_head_log2;
         aclnn_get_slope_inner(
             ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
-            m1, count, start, end + 1, step);
+            m1, count, start, end + 1, step, dtype);
     }
 }
 
@@ -1534,7 +1541,7 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
         ggml_cann_pool_alloc bias_allocator(
             ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
         bias_buffer = bias_allocator.get();
-        aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
+        aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
     }
 
     // broadcast for mask, slope and dst;
@@ -2263,6 +2270,7 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
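+ * @param corr_dims Two-element array {low, high} of YaRN ramp correction dims.
+ * @param ext_factor YaRN extrapolation mix factor; the YaRN path is taken only when non-zero.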
  */
 static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                              void* sin_tensor_buffer, void* cos_tensor_buffer,
+                             float* corr_dims, float ext_factor,
                              float theta_scale, float freq_scale,
                              float attn_factor, bool is_neox) {
     // init sin/cos cache, cache has different repeat method depending on
@@ -2318,16 +2326,60 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
         float n_elements = theta_scale_length;
         aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);
 
+        ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
+        aclTensor* acl_yarn_ramp_tensor = nullptr;
+        if (ext_factor != 0) {
+            // -rope_yarn_ramp
+            // const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
+            // return MIN(1, MAX(0, y)) - 1;
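+            //
+            // the op chain below evaluates ext_factor * (MIN(1, MAX(0, y)) - 1) on device,
+            // i.e. the negated ramp_mix from rope_yarn.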
+            yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
+            void* yarn_ramp_buffer = yarn_ramp_allocator.get();
+            acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float_t),
+                                                           theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+            float zero_value = 0, one_value = 1;
+            float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
+            aclScalar* low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);
+            aclScalar* zero = aclCreateScalar(&zero_value, aclDataType::ACL_FLOAT);
+            aclScalar* one = aclCreateScalar(&one_value, aclDataType::ACL_FLOAT);
+            aclScalar* denom_safe = aclCreateScalar(&denom_safe_value, aclDataType::ACL_FLOAT);
+            aclScalar* ext_factor_sc = aclCreateScalar(&ext_factor, aclDataType::ACL_FLOAT);
+
+            GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor, low, one, acl_yarn_ramp_tensor); // i0/2 - low
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor, denom_safe);                // y
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor, zero, zero);           // MAX(0, y)
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor, one);                   // MIN(1, ...)
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor, one, one);                  // ... - 1
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, ext_factor_sc);             // * ext_factor
2353+
2354+ // theta_interp = freq_scale * theta_extrap;
2355+ // theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
2356+ // theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
2357+ // theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
2358+ // theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
2359+ //
2360+ // we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse
2361+ // cache freq_scale + (freq_scale - 1) * ramp_mix
+            float freq_scale_1 = freq_scale - 1;
+            aclScalar* freq_scale_sc = aclCreateScalar(&freq_scale, aclDataType::ACL_FLOAT);
+            aclScalar* freq_scale_1_sc = aclCreateScalar(&freq_scale_1, aclDataType::ACL_FLOAT);
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, freq_scale_1_sc);    // ramp * (freq_scale - 1)
+            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor, freq_scale_sc, one); // + freq_scale
+
+            ggml_cann_release_resources(ctx, low, zero, one, denom_safe, ext_factor_sc, freq_scale_sc, freq_scale_1_sc);
+        }
+
         // power
         aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
         GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
                                 acl_theta_scale_tensor);
 
-        // freq_scale
-        if (freq_scale != 1) {
+        if (ext_factor != 0) {
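+            // scale each frequency by the cached YaRN factor (freq_scale + (freq_scale - 1) * ramp)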
+            aclnn_mul(ctx, acl_theta_scale_tensor, acl_yarn_ramp_tensor);
+        } else if (freq_scale != 1) {
             aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
         }
-        ggml_cann_release_resources(ctx, acl_theta_scale);
+
+        ggml_cann_release_resources(ctx, acl_yarn_ramp_tensor, acl_theta_scale);
     } else {
         // use cache
         acl_theta_scale_tensor =
@@ -2385,6 +2437,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
                                               GGML_MAX_DIMS, ACL_FORMAT_ND);
     aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
 
+    if (ext_factor != 0) {
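+        // fold YaRN's magnitude scale (mscale = 1 + 0.1 * ln(1 / freq_scale)) into attn_factor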
+        attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+
     // attn_factor
     if (attn_factor != 1) {
         aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
@@ -2465,8 +2521,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: n_dims <= ne0
     GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-    // TODO: ext_factor != 0
-    GGML_ASSERT(ext_factor == 0);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
@@ -2484,7 +2538,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     void* cos_tensor_buffer = cos_tensor_allocator.get();
 
     // init ctx.rope_cos/rope_sin cache
-    aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer,
+    aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer, corr_dims, ext_factor,
                      theta_scale, freq_scale, attn_factor, is_neox);
 
     int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
@@ -3220,7 +3274,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         const int64_t n_heads = src0->ne[2];
         ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
         void* slope_buffer = slope_allocator.get();
-        aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
+        aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
 
         int64_t slope_ne[] = {1, 1, n_heads, 1};
         size_t slope_nb[GGML_MAX_DIMS];