
Commit e35fecc

shibizhao authored and ggerganov committed
CANN: Add the basic supports of Flash Attention kernel (llama/13627)
* cann: add the basic FA support
* cann: update the readme
* cann: update the FlashAttention with PSEShift
* cann: update the input parameters in FA
* cann: update the alibi with max_bias
* cann: add the constrints of softcap
* cann: update the docs CANN.md
* cann: update the docs CANN.md
* cann: fix typo of CANN.md
* cann: add some comments and update the CANN.md
* cann: update the CANN.md
* cann: update the inner precise for fusedInferAttention
* cann: update the constraints of flash_attn_ext on ggml-cann.cpp
* cann: clean the whitespace
* cann: clean the whitespace
* cann: add a new endline
1 parent 1cd7028 commit e35fecc
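
For orientation, the node that ends up in this new kernel is the standard ggml flash-attention op. A minimal sketch, assuming the current ggml_flash_attn_ext API; the helper name, shapes, and parameter values below are illustrative and not part of this commit:

    #include <math.h>
    #include "ggml.h"

    // build_fa is a hypothetical helper: it creates the GGML_OP_FLASH_ATTN_EXT
    // node that a CANN backend dispatches to ggml_cann_flash_attn_ext().
    static struct ggml_tensor * build_fa(struct ggml_context * ctx,
                                         struct ggml_tensor * q,      // queries
                                         struct ggml_tensor * k,      // keys   (F16 for this backend)
                                         struct ggml_tensor * v,      // values (F16 for this backend)
                                         struct ggml_tensor * mask) { // optional mask / PSE shift
        const float scale         = 1.0f / sqrtf((float) q->ne[0]);   // 1/sqrt(head_dim)
        const float max_bias      = 0.0f;                             // ALiBi bias, 0 = disabled
        const float logit_softcap = 0.0f;                             // must stay 0: softcap is rejected by supports_op
        return ggml_flash_attn_ext(ctx, q, k, v, mask, scale, max_bias, logit_softcap);
    }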

File tree

8 files changed: +383, -0 lines changed


ggml/src/ggml-cann/CMakeLists.txt

100644 → 100755
File mode changed.

ggml/src/ggml-cann/Doxyfile

100644 → 100755
File mode changed.

ggml/src/ggml-cann/acl_tensor.cpp

100644 → 100755
Lines changed: 2 additions & 0 deletions
@@ -31,6 +31,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
             return ACL_FLOAT;
         case GGML_TYPE_F16:
             return ACL_FLOAT16;
+        case GGML_TYPE_BF16:
+            return ACL_BF16;
         case GGML_TYPE_I8:
             return ACL_INT8;
         case GGML_TYPE_I16:

ggml/src/ggml-cann/acl_tensor.h

100644 → 100755
File mode changed.

ggml/src/ggml-cann/aclnn_ops.cpp

100644 → 100755
Lines changed: 330 additions & 0 deletions
Large diffs are not rendered by default.

ggml/src/ggml-cann/aclnn_ops.h

100644 → 100755
Lines changed: 15 additions & 0 deletions
@@ -714,6 +714,21 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  */
 void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+/**
+ * @brief Performs the Flash Attention extended operator using the CANN backend.
+ *
+ * @details This function implements the memory-efficient Flash Attention algorithm
+ * for computing scaled dot-product attention with hardware acceleration.
+ * The result is stored in the destination tensor `dst`.
+ *
+ * This operation is accelerated using the CANN backend to improve runtime performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the result will be stored.
+ *            dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`.
+ */
+void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /*
  * @brief A generic wrapper for ACL resources with custom deleter support.
  */
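
The kernel body itself lives in aclnn_ops.cpp (diff not rendered above). As a rough sketch of the ggml-side conventions such a kernel relies on, assuming the usual op_params layout (scale, max_bias, logit_softcap); this is not the actual CANN implementation and the helper name is hypothetical:

    #include <string.h>
    #include "ggml.h"

    // unpack_flash_attn_ext shows how a GGML_OP_FLASH_ATTN_EXT node exposes
    // its operands and scalar parameters to a backend kernel.
    static void unpack_flash_attn_ext(const struct ggml_tensor * dst) {
        const struct ggml_tensor * q    = dst->src[0]; // queries
        const struct ggml_tensor * k    = dst->src[1]; // keys
        const struct ggml_tensor * v    = dst->src[2]; // values
        const struct ggml_tensor * mask = dst->src[3]; // optional mask (PSE shift / ALiBi base)

        float scale, max_bias, logit_softcap;
        memcpy(&scale,         (const float *) dst->op_params + 0, sizeof(float));
        memcpy(&max_bias,      (const float *) dst->op_params + 1, sizeof(float)); // drives the ALiBi slopes
        memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); // must be 0.0f (see supports_op below)

        (void) q; (void) k; (void) v; (void) mask;
        (void) scale; (void) max_bias; (void) logit_softcap;
    }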

ggml/src/ggml-cann/common.h

100644 → 100755
File mode changed.

ggml/src/ggml-cann/ggml-cann.cpp

100644 → 100755
Lines changed: 36 additions & 0 deletions
@@ -36,6 +36,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
+#include "ggml.h"
 
 #define GGML_COMMON_DECL_C
 
@@ -1748,6 +1749,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_COUNT_EQUAL:
             ggml_cann_count_equal(ctx, dst);
             break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            ggml_cann_flash_attn_ext(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -2177,6 +2181,38 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_COUNT_EQUAL:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:{
+            // derived from [ggml-cuda.cu]
+            if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
+                return false;
+            }
+            if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
+                return false;
+            }
+            if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
+                return false;
+            }
+            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
+                // different head sizes of K and V are not supported yet
+                return false;
+            }
+            if (op->src[0]->ne[0] == 192) {
+                return false;
+            }
+            if (op->src[0]->ne[0] == 576) {
+                // DeepSeek MLA
+                return false;
+            }
+            if (op->src[0]->ne[3] != 1) {
+                return false;
+            }
+            float logitSoftcap = 0.0f;
+            memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
+            if(logitSoftcap != 0.0f) {
+                return false;
+            }
+            return true;
+        }
         default:
            return false;
     }
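
These checks are what a scheduler consults before offloading the node to the CANN device. A minimal probe, assuming the public ggml-backend API; the helper name is made up for illustration:

    #include "ggml-backend.h"

    // Returns whether the given device accepts a flash-attention node.
    // Per the checks above, the CANN backend rejects, for example, non-F16 K/V,
    // head sizes 192 and 576 (DeepSeek MLA), ne[3] != 1, and any non-zero logit softcap.
    static bool cann_accepts_fa(ggml_backend_dev_t dev, const struct ggml_tensor * fa_node) {
        return ggml_backend_dev_supports_op(dev, fa_node);
    }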
