@@ -2587,3 +2587,163 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
     ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
 }
+
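+// Flash-attention for the CANN backend (GGML_OP_FLASH_ATTN_EXT): cast the query to
+// fp16, convert the additive fp16 mask into a boolean mask, run the fused
+// FusedInferAttentionScoreV2 kernel, then permute and cast the result back into dst.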
+void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+
+    ggml_tensor* src0 = dst->src[0]; // q, fp32
+    ggml_tensor* src1 = dst->src[1]; // k, fp16
+    ggml_tensor* src2 = dst->src[2]; // v, fp16
+    ggml_tensor* src3 = dst->src[3]; // mask, fp16
+
+    size_t faElemSize = sizeof(uint16_t);
+
+    // Step 1: cast the src0 (Query) to fp16
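+    // The fused kernel is run in fp16 here: k and v already arrive as fp16 (see the
+    // comments above), so only a non-fp16 query needs an explicit cast into a pooled
+    // fp16 buffer.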
+    aclTensor* acl_src0_f16_tensor = nullptr;
+
+    ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
+    void* src0_f16_buffer = nullptr;
+
+    if (src0->type != GGML_TYPE_F16){
+        aclTensor* acl_src0_f32_tensor = ggml_cann_create_tensor(src0);
+
+        src0_f16_allocator.alloc(ggml_nelements(src0) * faElemSize);
+        src0_f16_buffer = src0_f16_allocator.get();
+
+        int64_t* src0_f16_ne = src0->ne;
+        size_t src0_f16_nb[GGML_MAX_DIMS];
+        src0_f16_nb[0] = sizeof(uint16_t);
+        for (int i = 1; i < GGML_MAX_DIMS; ++i){
+            src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
+        }
+
+        acl_src0_f16_tensor = ggml_cann_create_tensor(
+            src0_f16_buffer, ACL_FLOAT16, faElemSize,
+            src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS
+        );
+        aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, ACL_FLOAT16);
+        ggml_cann_release_resources(ctx, acl_src0_f32_tensor);
+    } else {
+        acl_src0_f16_tensor = ggml_cann_create_tensor(src0);
+    }
+
+    // Step 2: generate the mask with ACL_BOOL
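+    // The ggml mask is an additive fp16 tensor whose masked positions are -INFINITY.
+    // IsNegInf turns it into a boolean tensor (true where the entry is -inf), which is
+    // what the boolean attenMask argument of the fused kernel consumes below.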
+    size_t maskElemSize = sizeof(char);
+    ggml_cann_pool_alloc src3_bool_allocator(ctx.pool());
+    src3_bool_allocator.alloc(ggml_nelements(src3) * maskElemSize);
+    void* src3_bool_buffer = src3_bool_allocator.get();
+
+    int64_t* src3_bool_ne = src3->ne;
+    size_t src3_bool_nb[GGML_MAX_DIMS];
+    src3_bool_nb[0] = maskElemSize;
+    for (int i = 1; i < GGML_MAX_DIMS; ++i){
+        src3_bool_nb[i] = src3_bool_nb[i - 1] * src3_bool_ne[i - 1];
+    }
+
+    aclTensor* acl_mask_f16_tensor = ggml_cann_create_tensor(src3);
+    aclTensor* acl_mask_bool_tensor = ggml_cann_create_tensor(
+        src3_bool_buffer, ACL_BOOL, maskElemSize,
+        src3_bool_ne, src3_bool_nb, GGML_MAX_DIMS);
+
+    GGML_CANN_CALL_ACLNN_OP(ctx, IsNegInf, acl_mask_f16_tensor, acl_mask_bool_tensor);
+    ggml_cann_release_resources(ctx, acl_mask_f16_tensor);
+
+    // Step 3: allocate the output tensor that the FA kernel writes into directly
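+    // The output buffer reuses src0->ne, i.e. the fused kernel's result is laid out
+    // like the query; it is produced as fp16 here and only converted to dst's layout
+    // and type in step 5.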
+    ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
+    out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
+    void* out_f16_buffer = out_f16_allocator.get();
+
+    int64_t* out_f16_ne = src0->ne;
+    size_t out_f16_nb[GGML_MAX_DIMS];
+    out_f16_nb[0] = faElemSize;
+    for (int i = 1; i < GGML_MAX_DIMS; ++i){
+        out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
+    }
+
+    aclTensor* acl_out_f16_tensor = ggml_cann_create_tensor(
+        out_f16_buffer, ACL_FLOAT16, faElemSize,
+        out_f16_ne, out_f16_nb, GGML_MAX_DIMS
+    );
+
+    // Step 4: run the fp16 Flash Attention kernel
+
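+    // FusedInferAttentionScoreV2 takes key/value as tensor lists; single-entry lists
+    // are used here since there is exactly one k and one v tensor.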
+    int kvTensorNum = 1;
+    aclTensor* acl_q_tensor = acl_src0_f16_tensor;
+    aclTensor* acl_k_tensors[] = {ggml_cann_create_tensor(src1)};
+    aclTensor* acl_v_tensors[] = {ggml_cann_create_tensor(src2)};
+    auto acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum);
+    auto acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum);
+    aclTensor* acl_out_tensor = acl_out_f16_tensor;
+
+
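+    // ggml stores ne[] fastest-dimension first (D, S, N, B); after the dimension
+    // reversal applied when building the ACL tensors this presumably corresponds to
+    // the "BNSD" layout passed below, which is why numHeads comes from ne[2] and the
+    // head size used for scaling comes from ne[0].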
+    int64_t numHeads = src0->ne[2]; // N
+    int64_t numKeyValueHeads = src1->ne[2];
+    double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
+    int64_t preTokens = 65535;
+    int64_t nextTokens = 65535;
+    char layout[5] = {'B', 'N', 'S', 'D', 0};
+    int64_t sparseMode = 0;
+    int64_t innerPrecise = 1;
+    int64_t blockSize = 0;
+    int64_t antiquantMode = 0;
+    bool softmaxLseFlag = false;
+    int64_t keyAntiquantMode = 0;
+    int64_t valueAntiquantMode = 0;
+
+    // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
+
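+    // All optional quantization, paged-attention (blockTable) and shared-prefix inputs
+    // are passed as nullptr; only q, k, v, the boolean mask and the scalar attributes
+    // above are used.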
+    GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
+        acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
+        nullptr, acl_mask_bool_tensor,                      // pse, mask
+        nullptr, nullptr,                                   // actSeqLen, actSeqLenkv
+        nullptr, nullptr,                                   // deqScale1, quantScale1
+        nullptr, nullptr, nullptr,                          // deqScale2, quantScale2, quantOffset2
+        nullptr, nullptr,                                   // antiquantScale, antiquantOffset
+        nullptr,                                            // blockTable
+        nullptr, nullptr,                                   // qPadSize, kvPadSize
+        nullptr, nullptr,                                   // kAntiquantScale, kAntiQuantOffset
+        nullptr, nullptr,                                   // vAntiquantScale, vAntiQuantOffset
+        nullptr, nullptr, nullptr,                          // kSharedPrefix, vSharedPrefix, actSharedLen
+        numHeads, scaleValue,                               // heads, scaleValue
+        preTokens, nextTokens,                              // preTokens, nextTokens
+        layout,                                             // inputLayout
+        numKeyValueHeads,                                   // numKVHeads
+        sparseMode, innerPrecise,                           // sparseMode, innerPrecise
+        blockSize, antiquantMode,                           // blockSize, antiquantMode
+        softmaxLseFlag,                                     // softmaxLseFlag
+        keyAntiquantMode, valueAntiquantMode,               // keyAntiqMode, valueAntiqMode
+        acl_out_tensor,                                     // attentionOut
+        nullptr                                             // softmaxLse
+    );
+
+    // Step 5: post-processing, permute and cast to f32
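+    // The FA kernel wrote its result with the query's BNSD layout; dst stores the head
+    // and sequence dimensions in the opposite order, so the output is permuted with
+    // {0, 2, 1, 3} and, when dst is not fp16, cast back to dst's type.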
+    int64_t new_dim[] = {0, 2, 1, 3};
+    aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+
+    if (dst->type != GGML_TYPE_F16){
+        ggml_cann_pool_alloc perm_out_f16_allocator(ctx.pool());
+        perm_out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
+        void* perm_out_f16_buffer = perm_out_f16_allocator.get();
+
+        int64_t* perm_out_f16_ne = dst->ne;
+        size_t perm_out_f16_nb[GGML_MAX_DIMS];
+        perm_out_f16_nb[0] = faElemSize;
+        for (int i = 1; i < GGML_MAX_DIMS; ++i){
+            perm_out_f16_nb[i] = perm_out_f16_nb[i - 1] * perm_out_f16_ne[i - 1];
+        }
+        aclTensor* acl_perm_out_f16_tensor = ggml_cann_create_tensor(
+            perm_out_f16_buffer, ACL_FLOAT16, faElemSize,
+            perm_out_f16_ne, perm_out_f16_nb, GGML_MAX_DIMS);
+        aclnn_permute(ctx, acl_out_tensor, acl_perm_out_f16_tensor, new_dim, GGML_MAX_DIMS);
+        aclnn_cast(ctx,
+            acl_perm_out_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
+        ggml_cann_release_resources(ctx, acl_perm_out_f16_tensor);
+    } else {
+        // only need to permute
+        aclnn_permute(ctx, acl_out_tensor, acl_dst_tensor, new_dim, GGML_MAX_DIMS);
+    }
+
+    ggml_cann_release_resources(ctx, acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list,
+                                acl_mask_bool_tensor, acl_out_f16_tensor,
+                                acl_dst_tensor);
+
+}