6464#include  < aclnnop/aclnn_reflection_pad1d.h> 
6565#include  < aclnnop/aclnn_eq_tensor.h> 
6666#include  < aclnnop/aclnn_gt_scalar.h> 
67+ #include  < aclnnop/aclnn_pow.h> 
6768#include  < float.h> 
6869
6970#include  < cmath> 
@@ -2159,69 +2160,60 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
21592160    ggml_tensor* src1 = dst->src [1 ];  //  position
21602161    ggml_tensor* src2 = dst->src [2 ];  //  freq_factors
21612162
2162-     //  arange, [0,1,...,ne0/2]
2163-     int64_t  arange_length = src0->ne [0 ] / 2 ;
2164-     ggml_cann_pool_alloc arange_allocator (ctx.pool (),
2165-                                           arange_length * sizeof (float_t ));
2166-     void * arange_buffer = arange_allocator.get ();
2167-     int64_t  arange_ne[] = {arange_length, 1 , 1 , 1 };
2168-     size_t  arange_nb[] = {sizeof (float_t ), sizeof (float_t ), sizeof (float_t ),
2169-                           arange_length * sizeof (float_t )};
2170- 
2171-     aclTensor* acl_arange_tensor =
2172-         ggml_cann_create_tensor (arange_buffer, ACL_FLOAT, sizeof (float_t ),
2173-                                 arange_ne, arange_nb, GGML_MAX_DIMS);
2174-     float  start = 0 ;
2175-     float  step = 1 ;
2176-     float  stop = src0->ne [0 ] / 2 ;
2177-     float  n_elements = src0->ne [0 ] / 2 ;
2178-     aclnn_arange (ctx, acl_arange_tensor, start, stop, step, n_elements);
2163+     GGML_TENSOR_BINARY_OP_LOCALS
21792164
2180-     //  power
2181-     //  aclnnPowScalarTensor(): @param self is tensor which should be scalar, so
2182-     //  use aclnn_pow_tensor_tensor() until fixed. aclScalar* acl_theta_scale =
2183-     //  aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
2184-     //  aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor,
2185-     //  acl_power_tensor);
2165+     //  theta_scale arange, [0,1,...,ne0/2-1]
2166+     int64_t  theta_scale_length = ne00 / 2 ;
21862167    ggml_cann_pool_alloc theta_scale_allocator (ctx.pool (),
2187-                                                arange_length  * sizeof (float_t ));
2168+                                           theta_scale_length  * sizeof (float_t ));
21882169    void * theta_scale_buffer = theta_scale_allocator.get ();
2189-     aclTensor* acl_theta_scale_tensor = aclnn_values (
2190-         ctx, theta_scale_buffer, arange_length * sizeof (float_t ), arange_ne,
2191-         GGML_MAX_DIMS, ACL_FLOAT, sizeof (float_t ), theta_scale);
2192-     aclnn_pow_tensor_tensor (ctx, acl_theta_scale_tensor, acl_arange_tensor);
2170+     int64_t  theta_scale_ne[] = {theta_scale_length, 1 , 1 , 1 };
2171+     size_t  theta_scale_nb[] = {sizeof (float_t ), sizeof (float_t ), sizeof (float_t ),
2172+                           theta_scale_length * sizeof (float_t )};
2173+ 
2174+     aclTensor* acl_theat_scale_tensor =
2175+         ggml_cann_create_tensor (theta_scale_buffer, ACL_FLOAT, sizeof (float_t ),
2176+                                 theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2177+     float  start = 0 ;
2178+     float  step = 1 ;
2179+     float  stop = ne00 / 2 ;
2180+     float  n_elements = ne00 / 2 ;
2181+     aclnn_arange (ctx, acl_theat_scale_tensor, start, stop, step, n_elements);
21932182
2183+     //  power
2184+     aclScalar* acl_theta_scale = aclCreateScalar (&theta_scale, aclDataType::ACL_FLOAT);
2185+     GGML_CANN_CALL_ACLNN_OP (PowScalarTensor, acl_theta_scale, acl_theat_scale_tensor, acl_theat_scale_tensor);
2186+     
21942187    //  freq_scale
21952188    if  (freq_scale != 1 ) {
2196-         aclnn_muls (ctx, acl_theta_scale_tensor , freq_scale, nullptr , true );
2189+         aclnn_muls (ctx, acl_theat_scale_tensor , freq_scale, nullptr , true );
21972190    }
21982191
21992192    //  freq_factors
22002193    if  (src2) {
22012194        aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor (
22022195            src2->data , ggml_cann_type_mapping (src2->type ),
2203-             ggml_type_size (src2->type ), arange_ne, arange_nb , GGML_MAX_DIMS);
2204-         aclnn_div (ctx, acl_theta_scale_tensor , acl_freq_factors_tensor);
2196+             ggml_type_size (src2->type ), theta_scale_ne, theta_scale_nb , GGML_MAX_DIMS);
2197+         aclnn_div (ctx, acl_theat_scale_tensor , acl_freq_factors_tensor);
22052198        ACL_CHECK (aclDestroyTensor (acl_freq_factors_tensor));
22062199    }
22072200
22082201    //  position
22092202    GGML_ASSERT (src1->type  == GGML_TYPE_I32);
22102203    int64_t  position_length = src1->ne [0 ];
2211-     int64_t  position_ne[] = {1 , position_length, 1 , 1 };
2212-     size_t  position_nb[] = {sizeof (int32_t ), sizeof (int32_t ),
2213-                             sizeof (int32_t ) * position_length,
2204+     int64_t  position_ne[] = {1 , 1 , position_length, 1 };
2205+     size_t  position_nb[] = {sizeof (int32_t ), sizeof (int32_t ), sizeof (int32_t ),
22142206                            sizeof (int32_t ) * position_length};
22152207    aclTensor* acl_position_tensor = ggml_cann_create_tensor (
22162208        src1->data , ggml_cann_type_mapping (src1->type ),
22172209        ggml_type_size (src1->type ), position_ne, position_nb, GGML_MAX_DIMS);
22182210
22192211    //  power * position
2220-     int64_t  theta_length = arange_length  * position_length;
2212+     int64_t  theta_length = theta_scale_length  * position_length;
22212213    ggml_cann_pool_alloc theta_allocator (ctx.pool (),
22222214                                         theta_length * sizeof (float_t ));
22232215    void * theta_buffer = theta_allocator.get ();
2224-     int64_t  theta_ne[] = {arange_length, position_length,  1 , 1 };
2216+     int64_t  theta_ne[] = {theta_scale_length,  1 , position_length , 1 };
22252217    size_t  theta_nb[GGML_MAX_DIMS];
22262218    theta_nb[0 ] = sizeof (float_t );
22272219    for  (int  i = 1 ; i < GGML_MAX_DIMS; i++) {
@@ -2230,43 +2222,25 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
22302222    aclTensor* acl_theta_tensor =
22312223        ggml_cann_create_tensor (theta_buffer, ACL_FLOAT, sizeof (float_t ),
22322224                                theta_ne, theta_nb, GGML_MAX_DIMS);
2233-     aclnn_mul (ctx, acl_position_tensor, acl_theta_scale_tensor ,
2225+     aclnn_mul (ctx, acl_position_tensor, acl_theat_scale_tensor ,
22342226              acl_theta_tensor);
22352227
2236-     //  permute: [0,1,2,3]->[0,2,1,3]
2237-     int64_t  permute_ne[] = {arange_length, 1 , position_length, 1 };
2238-     size_t  permute_nb[GGML_MAX_DIMS];
2239-     permute_nb[0 ] = sizeof (float_t );
2240-     for  (int  i = 1 ; i < GGML_MAX_DIMS; i++) {
2241-         permute_nb[i] = permute_nb[i - 1 ] * permute_ne[i - 1 ];
2242-     }
2243-     ggml_cann_pool_alloc permute_allocator (ctx.pool (),
2244-                                            theta_length * sizeof (float_t ));
2245-     void * permute_buffer = permute_allocator.get ();
2246-     aclTensor* acl_permute_tensor = ggml_cann_create_tensor (
2247-         permute_buffer, ACL_FLOAT, sizeof (float_t ), permute_ne, permute_nb,
2248-         GGML_MAX_DIMS, ACL_FORMAT_ND);
2249-     int64_t  permute_dim[] = {0 , 2 , 1 , 3 };
2250-     int64_t  num_dims = 4 ;
2251-     aclnn_permute (ctx, acl_theta_tensor, acl_permute_tensor, permute_dim,
2252-                   num_dims);
2253- 
22542228    //  sin/cos
22552229    ggml_cann_pool_alloc sin_allocator (ctx.pool (),
22562230                                       theta_length * sizeof (float_t ));
22572231    void * sin_buffer = sin_allocator.get ();
22582232    aclTensor* acl_sin_tensor = ggml_cann_create_tensor (
2259-         sin_buffer, ACL_FLOAT, sizeof (float_t ), permute_ne, permute_nb ,
2233+         sin_buffer, ACL_FLOAT, sizeof (float_t ), theta_ne, theta_nb ,
22602234        GGML_MAX_DIMS, ACL_FORMAT_ND);
2261-     aclnn_sin (ctx, acl_permute_tensor , acl_sin_tensor);
2235+     aclnn_sin (ctx, acl_theta_tensor , acl_sin_tensor);
22622236
22632237    ggml_cann_pool_alloc cos_allocator (ctx.pool (),
22642238                                       theta_length * sizeof (float_t ));
22652239    void * cos_buffer = cos_allocator.get ();
22662240    aclTensor* acl_cos_tensor = ggml_cann_create_tensor (
2267-         cos_buffer, ACL_FLOAT, sizeof (float_t ), permute_ne, permute_nb ,
2241+         cos_buffer, ACL_FLOAT, sizeof (float_t ), theta_ne, theta_nb ,
22682242        GGML_MAX_DIMS, ACL_FORMAT_ND);
2269-     aclnn_cos (ctx, acl_permute_tensor , acl_cos_tensor);
2243+     aclnn_cos (ctx, acl_theta_tensor , acl_cos_tensor);
22702244
22712245    //  attn_factor
22722246    if  (attn_factor != 1 ) {
@@ -2282,19 +2256,17 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
22822256    } else  {
22832257        int64_t  num_repeats = 2 ;
22842258        int64_t  dim = 3 ;
2285-         int64_t  output_size = arange_length  * num_repeats;
2259+         int64_t  output_size = theta_scale_length  * num_repeats;
22862260        aclnn_repeat_interleave (ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
22872261                                num_repeats, output_size);
22882262        aclnn_repeat_interleave (ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
22892263                                num_repeats, output_size);
22902264    }
22912265
22922266    //  release
2293-     ACL_CHECK (aclDestroyTensor (acl_arange_tensor));
2294-     ACL_CHECK (aclDestroyTensor (acl_theta_scale_tensor));
2267+     ACL_CHECK (aclDestroyTensor (acl_theat_scale_tensor));
22952268    ACL_CHECK (aclDestroyTensor (acl_position_tensor));
22962269    ACL_CHECK (aclDestroyTensor (acl_theta_tensor));
2297-     ACL_CHECK (aclDestroyTensor (acl_permute_tensor));
22982270    ACL_CHECK (aclDestroyTensor (acl_sin_tensor));
22992271    ACL_CHECK (aclDestroyTensor (acl_cos_tensor));
23002272}
@@ -2353,13 +2325,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
23532325
23542326    //  init cos/sin cache
23552327    ggml_cann_pool_alloc sin_allocator (
2356-         ctx.pool (), src0-> ne [ 0 ]  * src0-> ne [ 2 ]  * sizeof (float_t ));
2328+         ctx.pool (), ne00  * ne02  * sizeof (float_t ));
23572329    ggml_cann_pool_alloc cos_allocator (
2358-         ctx.pool (), src0-> ne [ 0 ]  * src0-> ne [ 2 ]  * sizeof (float_t ));
2330+         ctx.pool (), ne00  * ne02  * sizeof (float_t ));
23592331    void * sin_buffer = sin_allocator.get ();
23602332    void * cos_buffer = cos_allocator.get ();
23612333
2362-     int64_t  sin_reshape_ne[4 ] = {src0-> ne [ 0 ] , 1 , src0-> ne [ 2 ] , 1 };
2334+     int64_t  sin_reshape_ne[4 ] = {ne00 , 1 , ne02 , 1 };
23632335    size_t  sin_reshape_nb[GGML_MAX_DIMS];
23642336    sin_reshape_nb[0 ] = sizeof (float_t );
23652337    for  (int  i = 1 ; i < GGML_MAX_DIMS; i++) {
0 commit comments