@@ -2898,14 +2898,14 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             ggml_nelements(src0) * faElemSize);
 
         int64_t* src0_f16_ne = src0->ne;
-        size_t src0_f16_nb[GGML_MAX_DIMS]; 
+        size_t src0_f16_nb[GGML_MAX_DIMS];
         src0_f16_nb[0] = sizeof(uint16_t);
         for (int i = 1; i < GGML_MAX_DIMS; ++i) {
             src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
         }
 
         acl_src0_f16_tensor = ggml_cann_create_tensor(
-            src0_f16_buffer, faDataType, faElemSize, 
+            src0_f16_buffer, faDataType, faElemSize,
             src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS
         );
         aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, faDataType);
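For reference, the stride computation in this hunk is ggml's standard contiguous-layout rule: ne[i] holds the extent of dimension i and nb[i] its stride in bytes, with dimension 0 the innermost. A minimal host-side sketch of the same rule, assuming GGML_MAX_DIMS == 4 as in ggml:

    #include <cstddef>
    #include <cstdint>

    // Byte strides for a packed f16 tensor in ggml dimension order
    // (dim 0 varies fastest); mirrors the loop in the hunk above.
    static void contiguous_strides_f16(const int64_t ne[4], size_t nb[4]) {
        nb[0] = sizeof(uint16_t);                 // f16 element size
        for (int i = 1; i < 4; ++i) {
            nb[i] = nb[i - 1] * (size_t) ne[i - 1];
        }
    }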
@@ -2914,7 +2914,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
         acl_src0_f16_tensor = ggml_cann_create_tensor(src0);
     }
 
-    // Step 2: create the acl tensors for src1 (Key), src2 (Value), 
+    // Step 2: create the acl tensors for src1 (Key), src2 (Value),
     //         and the direct output from FusedInferAttention
 
     acl_src1_f16_tensor = ggml_cann_create_tensor(src1);
@@ -2932,24 +2932,23 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
     }
 
     acl_dst_f16_tensor = ggml_cann_create_tensor(
-        out_f16_buffer, faDataType, faElemSize, 
+        out_f16_buffer, faDataType, faElemSize,
         out_f16_ne, out_f16_nb, GGML_MAX_DIMS
     );
 
-
     // Step 3: create the PSEShift tensor if needed
     //         this tensor is treated as the mask (f16) in llama.cpp
- 
+
     aclTensor* bcast_pse_tensor = nullptr;
     int64_t bcast_pse_ne[GGML_MAX_DIMS];
     size_t bcast_pse_nb[GGML_MAX_DIMS];
     ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
     void* bcast_pse_buffer = nullptr;
- 
+
     if (src3 != nullptr) {
         bcast_pse_buffer = bcast_pse_allocator.alloc(
             ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));
- 
+
         if (src0->ne[1] > 1) {
             // Case 1: broadcast pse for the prefill stage with multiple heads
             aclTensor* acl_mask_f16_tensor = ggml_cann_create_tensor(src3);
@@ -2964,7 +2963,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             }
 
             bcast_pse_tensor = ggml_cann_create_tensor(
-                bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t), 
+                bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
                 bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
 
             int64_t repeats[] = {1, src0->ne[2], 1, 1};
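The repeats array {1, src0->ne[2], 1, 1} tiles the single mask plane across every attention head, so FusedInferAttention receives one PSE plane per head. A host-side sketch of the equivalent tiling (the real op runs on the NPU through aclnn_repeat; plane, the element count of one mask plane, is an illustrative name):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Copy one f16 mask plane n_head times, back to back.
    static std::vector<uint16_t> tile_mask_per_head(const uint16_t* mask,
                                                    size_t plane,
                                                    int64_t n_head) {
        std::vector<uint16_t> out(plane * (size_t) n_head);
        for (int64_t h = 0; h < n_head; ++h) {
            std::memcpy(out.data() + (size_t) h * plane, mask,
                        plane * sizeof(uint16_t));
        }
        return out;
    }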
@@ -2977,7 +2976,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             size_t* trunc_pse_nb = src3->nb;
 
             aclTensor* acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
-                src3->data, ACL_FLOAT16, sizeof(uint16_t), 
+                src3->data, ACL_FLOAT16, sizeof(uint16_t),
                 trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);
 
             bcast_pse_ne[0] = src3->ne[0];
@@ -2991,7 +2990,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             }
 
             bcast_pse_tensor = ggml_cann_create_tensor(
-                bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t), 
+                bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
                 bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
 
             int64_t repeats[] = {1, src0->ne[2], 1, 1};
@@ -3007,8 +3006,8 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             const int64_t n_head = src0->ne[2];
             const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
             float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
-            float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor); 
-            // init arange 
+            float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
+            // init arange
             ggml_cann_pool_alloc arange_allocator(ctx.pool(),
                                                   ne2_ne3 * faElemSize);
             void* tmp_arange_buffer = arange_allocator.get();
@@ -3076,11 +3075,11 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             int64_t tmp_mk_base_ne[] = {ne2_ne3};
             size_t tmp_mk_base_nb[] = {faElemSize};
             aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
-                tmp_mk_base_buffer, faDataType, faElemSize, 
+                tmp_mk_base_buffer, faDataType, faElemSize,
                 tmp_mk_base_ne, tmp_mk_base_nb,
                 GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
             aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
-                tmp_arange_buffer, faDataType, faElemSize, 
+                tmp_arange_buffer, faDataType, faElemSize,
                 tmp_mk_base_ne, tmp_mk_base_nb,
                 GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
             aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
@@ -3095,12 +3094,12 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
                 tmp_mk_base_buffer, faDataType, faElemSize,
                 tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
-                ACL_FORMAT_ND); 
+                ACL_FORMAT_ND);
             GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);
 
             ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
                 tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
-                tmp_arange_tensor, tmp_mk_tensor); 
+                tmp_arange_tensor, tmp_mk_tensor);
         }
     }
 
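The m0/m1 constants are the two ALiBi slope bases: the first n_heads_log2_floor heads take successive powers of m0, and any remaining heads take odd powers of m1, which is what the arange/pow construction above computes in bulk on the device. A scalar reference consistent with those definitions (a sketch of the math, not the device code path):

    #include <cmath>

    // ALiBi slope for head h (0-based) out of n_head, bias scale max_bias.
    static float alibi_slope(int h, int n_head, float max_bias) {
        const int n_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
        const float m0 = powf(2.0f, -max_bias / n_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_log2);
        return h < n_log2 ? powf(m0, (float) (h + 1))
                          : powf(m1, (float) (2 * (h - n_log2) + 1));
    }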
@@ -3128,7 +3127,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
     // Step 5: launch the FusedInferAttentionScoreV2 kernel.
     // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
- 
+
     GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
         acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
         bcast_pse_tensor, nullptr, // pse, mask
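Functionally, the fused kernel computes scaled dot-product attention with the PSE bias added to the logits before the softmax. A naive single-query-row reference of that math (host-side and illustrative only; the kernel's actual signature and layouts are described in the linked document):

    #include <cmath>
    #include <vector>

    // out = softmax(scale * q K^T + pse) V for one query row.
    // K and V are n_kv x d, row-major; pse may be null.
    static void attention_row_ref(const float* q, const float* K,
                                  const float* V, const float* pse,
                                  float* out, int n_kv, int d, float scale) {
        std::vector<float> s(n_kv);
        float maxv = -INFINITY;
        for (int j = 0; j < n_kv; ++j) {
            float dot = 0.0f;
            for (int k = 0; k < d; ++k) dot += q[k] * K[j * d + k];
            s[j] = scale * dot + (pse ? pse[j] : 0.0f);
            maxv = std::fmax(maxv, s[j]);
        }
        float denom = 0.0f;
        for (int j = 0; j < n_kv; ++j) {
            s[j] = expf(s[j] - maxv);   // max-subtracted for stability
            denom += s[j];
        }
        for (int k = 0; k < d; ++k) {
            float acc = 0.0f;
            for (int j = 0; j < n_kv; ++j) acc += s[j] * V[j * d + k];
            out[k] = acc / denom;
        }
    }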
@@ -3170,20 +3169,20 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
             perm_out_f16_nb[i] = perm_out_f16_nb[i - 1] * perm_out_f16_ne[i - 1];
         }
         aclTensor* acl_perm_out_f16_tensor = ggml_cann_create_tensor(
-            perm_out_f16_buffer, faDataType, faElemSize, 
+            perm_out_f16_buffer, faDataType, faElemSize,
             perm_out_f16_ne, perm_out_f16_nb, GGML_MAX_DIMS);
         aclnn_permute(ctx, acl_dst_f16_tensor, acl_perm_out_f16_tensor, new_dim, GGML_MAX_DIMS);
-        aclnn_cast(ctx, 
+        aclnn_cast(ctx,
             acl_perm_out_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
         ggml_cann_release_resources(ctx, acl_perm_out_f16_tensor);
     } else {
         // only need to permute
         aclnn_permute(ctx, acl_dst_f16_tensor, acl_dst_tensor, new_dim, GGML_MAX_DIMS);
     }
-    ggml_cann_release_resources(ctx, acl_src0_f16_tensor, 
-        acl_src1_f16_tensor, 
-        acl_src2_f16_tensor, 
-        acl_dst_f16_tensor, 
+    ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
+        acl_src1_f16_tensor,
+        acl_src2_f16_tensor,
+        acl_dst_f16_tensor,
         acl_dst_tensor);
     if (src3 != nullptr) {
         ggml_cann_release_resources(ctx, bcast_pse_tensor);