From 004f090757be2cba3c6259c0f94b7999e895030c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B5=B5=E7=A6=B9=E6=98=87?=
 <2501112001@cninfer02.localdomain>
Date: Fri, 31 Oct 2025 16:50:06 +0800
Subject: [PATCH 1/3] support gated linear attn

---
 ggml/src/ggml-cann/aclnn_ops.cpp | 88 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-cann/aclnn_ops.h   | 16 ++++++
 ggml/src/ggml-cann/ggml-cann.cpp |  4 ++
 3 files changed, 108 insertions(+)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index bc33b99d96e..2a359012f33 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -38,6 +38,7 @@
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
 #include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_mv.h>
 #include <aclnnop/aclnn_permute.h>
 #include <aclnnop/aclnn_pow_tensor_tensor.h>
 #include <aclnnop/aclnn_reduce_sum.h>
@@ -439,6 +440,93 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_cann_release_resources(ctx, norm, acl_src, acl_dst);
 }
 
+void ggml_cann_gated_linear_attn(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor * k  = dst->src[0];
+    ggml_tensor * v  = dst->src[1];
+    ggml_tensor * q  = dst->src[2];
+    ggml_tensor * g = dst->src[3];
+    ggml_tensor * s  = dst->src[4];
+
+    int64_t B = dst->src[4]->ne[1];
+    int64_t T = dst->src[0]->ne[2];
+    int64_t H = dst->src[0]->ne[1];
+    int64_t C = dst->ne[0];
+    int64_t D = C / H;
+    int64_t L = T / B;
+
+    int64_t ne_qkg[2] = {1, D};
+    // int64_t ne_qkg[2] = {D, 1};
+    int64_t ne_s[2] = {D, D};
+    int64_t ne_vo[2] = {D, 1};
+    // int64_t ne_vo[2] = {1, D};
+    int64_t ne_q[1] = {D};
+    size_t nb_base = ggml_type_size(k->type);
+    size_t nb_qkg[2] = {nb_base, nb_base};
+    size_t nb_s[2] = {nb_base, D * nb_base};
+    size_t nb_vo[2] = {nb_base, D * nb_base};
+    size_t nb_q[1] = {nb_base};
+
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    for (int64_t b = 0; b < B; b++) {
+        for (int64_t h = 0; h < H; h++) {
+            size_t s_offset = (b * (H * D * D) + h * (D * D)) * nb_base;
+            // D * D
+            aclTensor* acl_s = ggml_cann_create_tensor(s, ne_s, nb_s, 2, ACL_FORMAT_ND, s_offset);
+            aclTensor* acl_s_new = ggml_cann_create_tensor(dst, ne_s, nb_s, 2, ACL_FORMAT_ND, (B * L * H * D) * nb_base + s_offset);
+            cann_copy(ctx, acl_s, acl_s_new);
+            for (int64_t l = 0; l < L; l++) {
+                size_t qkvgo_offset = (b * (L * H * D) + l * (H * D) + h * (D)) * nb_base;
+                // D * 1
+                aclTensor* acl_k = ggml_cann_create_tensor(k, ne_qkg, nb_qkg, 2, ACL_FORMAT_ND, qkvgo_offset);
+                aclTensor* acl_g = ggml_cann_create_tensor(g, ne_qkg, nb_qkg, 2, ACL_FORMAT_ND, qkvgo_offset);
+                // D
+                aclTensor* acl_q = ggml_cann_create_tensor(q, ne_q, nb_q, 1, ACL_FORMAT_ND, qkvgo_offset);
+                // 1 * D
+                aclTensor* acl_v = ggml_cann_create_tensor(v, ne_vo, nb_vo, 2, ACL_FORMAT_ND, qkvgo_offset);
+                // D
+                aclTensor* acl_o = ggml_cann_create_tensor(dst, ne_q, nb_q, 1, ACL_FORMAT_ND, qkvgo_offset);
+                // repeat k and v
+                // buffer for repeated k
+                size_t buf_size = D * D * sizeof(float);
+                ggml_cann_pool_alloc state_buf1(ctx.pool(), buf_size);
+                void* buf1_ptr = state_buf1.get();
+                aclTensor* acl_buf_k = ggml_cann_create_tensor(buf1_ptr, ggml_cann_type_mapping(k->type), ggml_type_size(k->type), ne_s, nb_s, 2);
+                // buffer for repeated v
+                ggml_cann_pool_alloc state_buf2(ctx.pool(), buf_size);
+                void* buf2_ptr = state_buf2.get();
+                aclTensor* acl_buf_v = ggml_cann_create_tensor(buf2_ptr, ggml_cann_type_mapping(k->type), ggml_type_size(k->type), ne_s, nb_s, 2);
+                // repeat
+                int64_t k_rep[2] = {1, D};
+                int64_t v_rep[2] = {D, 1};
+                // int64_t k_rep[2] = {D, 1};
+                // int64_t v_rep[2] = {1, D};
+                aclIntArray* acl_k_rep = aclCreateIntArray(k_rep, 2);
+                aclIntArray* acl_v_rep = aclCreateIntArray(v_rep, 2);
+                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_k, acl_k_rep, acl_buf_k);
+                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_v, acl_v_rep, acl_buf_v);
+                // inplace mul, saved in acl_buf_k
+                aclnn_mul(ctx, acl_buf_k, acl_buf_v, nullptr);
+                // apply g to s
+                // reuse acl_buf_v to store repeated g
+                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_g, acl_k_rep, acl_buf_v);
+                aclnn_mul(ctx, acl_s_new, acl_buf_v, nullptr);
+                // add kv
+                aclnn_add(ctx, acl_s_new, acl_buf_k, nullptr);
+                // compute output
+                // permute state and store in acl_buf k
+                int64_t newdim[2] = {1, 0};
+                aclnn_permute(ctx, acl_s_new, acl_buf_k, newdim, 2);
+                GGML_CANN_CALL_ACLNN_OP(ctx, Mv, acl_buf_k, acl_q, acl_o, 1);
+                aclnn_muls(ctx, acl_o, scale, nullptr, true);
+                ggml_cann_release_resources(ctx, acl_q, acl_k, acl_v, acl_o, acl_g, acl_buf_k, acl_buf_v, acl_k_rep, acl_v_rep);
+            }
+            ggml_cann_release_resources(ctx, acl_s, acl_s_new);
+        }
+    }
+}
+
 void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
 
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index 5c510cc9932..6cc830b0eea 100755
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -187,6 +187,18 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
  */
 void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
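+/**
+ * @brief   Computes the Gated Linear Attention for a ggml tensor using the CANN
+ *          backend.
+ *
+ * @details Per sequence and head: copies the state (src[4]) into dst, then per token scales it by g, adds k*v, and writes scale * (state applied to q).
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor; src[0..4] are k, v, q, g and the state, and op_params holds the scale.
+ * @attention The updated state is written into dst after the first B*L*H*D output elements.
+ */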
+void ggml_cann_gated_linear_attn(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief  Computes the Group Normalization for a ggml tensor using the CANN
  *         backend.
@@ -605,6 +617,10 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     aclTensor* acl_dst);
 
+static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst);
+static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                          aclTensor* acl_dst, int64_t* new_dim, uint64_t dims);
+
 /**
  * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
  * output tensor.
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cb8af42ebf9..016fbe44ca1 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1881,6 +1881,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_FLASH_ATTN_EXT:
             ggml_cann_flash_attn_ext(ctx, dst);
             break;
+        case GGML_OP_GATED_LINEAR_ATTN:
+            ggml_cann_gated_linear_attn(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -2493,6 +2496,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MEAN:
         case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_COUNT_EQUAL:
+        case GGML_OP_GATED_LINEAR_ATTN:
             return true;
         case GGML_OP_SCALE:
             float bias;

From ff7919e87e2e7b7e83ae143ea99245f5627fd6ac Mon Sep 17 00:00:00 2001
From: ewykric <wangyukai@stu.pku.edu.cn>
Date: Wed, 26 Nov 2025 18:20:04 +0800
Subject: [PATCH 2/3] CANN: aclnn_ops.cpp gatedlinearattn optimization

---
 ggml/src/ggml-cann/aclnn_ops.cpp | 128 ++++++++++++++++++-------------
 1 file changed, 75 insertions(+), 53 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 2a359012f33..a33d6507a4b 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -441,90 +441,112 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 }
 
 void ggml_cann_gated_linear_attn(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // Fetch the input tensors
     ggml_tensor * k  = dst->src[0];
     ggml_tensor * v  = dst->src[1];
     ggml_tensor * q  = dst->src[2];
     ggml_tensor * g = dst->src[3];
     ggml_tensor * s  = dst->src[4];
 
-    int64_t B = dst->src[4]->ne[1];
-    int64_t T = dst->src[0]->ne[2];
-    int64_t H = dst->src[0]->ne[1];
-    int64_t C = dst->ne[0];
-    int64_t D = C / H;
-    int64_t L = T / B;
-
-    int64_t ne_qkg[2] = {1, D};
-    // int64_t ne_qkg[2] = {D, 1};
-    int64_t ne_s[2] = {D, D};
-    int64_t ne_vo[2] = {D, 1};
-    // int64_t ne_vo[2] = {1, D};
-    int64_t ne_q[1] = {D};
+    // Derive the dimension parameters
+    int64_t B = dst->src[4]->ne[1]; // Batch size
+    int64_t T = dst->src[0]->ne[2]; // Total sequence length
+    int64_t H = dst->src[0]->ne[1]; // Number of heads
+    int64_t C = dst->ne[0];         // Total channels
+    int64_t D = C / H;              // Dimensionality per head
+    int64_t L = T / B;              // Sequence length per batch
+
+    // Set up tensor shapes and strides
+    int64_t ne_qkg[2] = {1, D};     // shape of k/g: [1,D]
+    int64_t ne_s[2] = {D, D};       // shape of the state tensor: [D,D]
+    int64_t ne_vo[2] = {D, 1};      // shape of v: [D,1]
+    int64_t ne_q[1] = {D};          // shape of q/o: [D]
+    
+    // Compute strides (memory layout)
     size_t nb_base = ggml_type_size(k->type);
     size_t nb_qkg[2] = {nb_base, nb_base};
     size_t nb_s[2] = {nb_base, D * nb_base};
     size_t nb_vo[2] = {nb_base, D * nb_base};
     size_t nb_q[1] = {nb_base};
 
+    // Read the scale factor
     float scale;
     memcpy(&scale, dst->op_params, sizeof(float));
 
+    // Pre-allocate buffers to avoid repeated allocation inside the loops (performance optimization 1)
+    size_t buf_size = D * D * sizeof(float); // NOTE(review): sized with sizeof(float) while the tensors use k->type - confirm k is always F32
+    ggml_cann_pool_alloc state_buf1(ctx.pool(), buf_size);
+    void* buf1_ptr = state_buf1.get();
+    ggml_cann_pool_alloc state_buf2(ctx.pool(), buf_size);
+    void* buf2_ptr = state_buf2.get();
+    
+    // Create reusable buffer tensors (performance optimization 2)
+    aclTensor* acl_buf_k = ggml_cann_create_tensor(buf1_ptr, ggml_cann_type_mapping(k->type), 
+                                                  ggml_type_size(k->type), ne_s, nb_s, 2);
+    aclTensor* acl_buf_v = ggml_cann_create_tensor(buf2_ptr, ggml_cann_type_mapping(k->type), 
+                                                  ggml_type_size(k->type), ne_s, nb_s, 2);
+    
+    // Pre-create the repeat-count arrays (performance optimization 3)
+    int64_t k_rep[2] = {1, D};      // repeat pattern for k/g: [1,D] -> [D,D]
+    int64_t v_rep[2] = {D, 1};      // repeat pattern for v: [D,1] -> [D,D]
+    aclIntArray* acl_k_rep = aclCreateIntArray(k_rep, 2);
+    aclIntArray* acl_v_rep = aclCreateIntArray(v_rep, 2);
+    
+    // Define the permutation used for the transpose
+    int64_t newdim[2] = {1, 0};      // [D,D] -> [D,D] (transpose)
+    
+    // Iterate over batches, heads and time steps
     for (int64_t b = 0; b < B; b++) {
         for (int64_t h = 0; h < H; h++) {
+            // Compute the offset of the state tensor
             size_t s_offset = (b * (H * D * D) + h * (D * D)) * nb_base;
-            // D * D
+            
+            // Create the state tensors
             aclTensor* acl_s = ggml_cann_create_tensor(s, ne_s, nb_s, 2, ACL_FORMAT_ND, s_offset);
-            aclTensor* acl_s_new = ggml_cann_create_tensor(dst, ne_s, nb_s, 2, ACL_FORMAT_ND, (B * L * H * D) * nb_base + s_offset);
+            aclTensor* acl_s_new = ggml_cann_create_tensor(dst, ne_s, nb_s, 2, ACL_FORMAT_ND, 
+                                                          (B * L * H * D) * nb_base + s_offset);
+            
+            // Copy the initial state
             cann_copy(ctx, acl_s, acl_s_new);
+            
+            // Iterate over time steps, updating the state and computing the output
             for (int64_t l = 0; l < L; l++) {
+                // Compute the q/k/v/g/o offset for the current time step
                 size_t qkvgo_offset = (b * (L * H * D) + l * (H * D) + h * (D)) * nb_base;
-                // D * 1
+                
+                // Create the tensors needed for the current time step
                 aclTensor* acl_k = ggml_cann_create_tensor(k, ne_qkg, nb_qkg, 2, ACL_FORMAT_ND, qkvgo_offset);
                 aclTensor* acl_g = ggml_cann_create_tensor(g, ne_qkg, nb_qkg, 2, ACL_FORMAT_ND, qkvgo_offset);
-                // D
                 aclTensor* acl_q = ggml_cann_create_tensor(q, ne_q, nb_q, 1, ACL_FORMAT_ND, qkvgo_offset);
-                // 1 * D
                 aclTensor* acl_v = ggml_cann_create_tensor(v, ne_vo, nb_vo, 2, ACL_FORMAT_ND, qkvgo_offset);
-                // D
                 aclTensor* acl_o = ggml_cann_create_tensor(dst, ne_q, nb_q, 1, ACL_FORMAT_ND, qkvgo_offset);
-                // repeat k and v
-                // buffer for repeated k
-                size_t buf_size = D * D * sizeof(float);
-                ggml_cann_pool_alloc state_buf1(ctx.pool(), buf_size);
-                void* buf1_ptr = state_buf1.get();
-                aclTensor* acl_buf_k = ggml_cann_create_tensor(buf1_ptr, ggml_cann_type_mapping(k->type), ggml_type_size(k->type), ne_s, nb_s, 2);
-                // buffer for repeated v
-                ggml_cann_pool_alloc state_buf2(ctx.pool(), buf_size);
-                void* buf2_ptr = state_buf2.get();
-                aclTensor* acl_buf_v = ggml_cann_create_tensor(buf2_ptr, ggml_cann_type_mapping(k->type), ggml_type_size(k->type), ne_s, nb_s, 2);
-                // repeat
-                int64_t k_rep[2] = {1, D};
-                int64_t v_rep[2] = {D, 1};
-                // int64_t k_rep[2] = {D, 1};
-                // int64_t v_rep[2] = {1, D};
-                aclIntArray* acl_k_rep = aclCreateIntArray(k_rep, 2);
-                aclIntArray* acl_v_rep = aclCreateIntArray(v_rep, 2);
-                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_k, acl_k_rep, acl_buf_k);
-                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_v, acl_v_rep, acl_buf_v);
-                // inplace mul, saved in acl_buf_k
-                aclnn_mul(ctx, acl_buf_k, acl_buf_v, nullptr);
-                // apply g to s
-                // reuse acl_buf_v to store repeated g
-                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_g, acl_k_rep, acl_buf_v);
-                aclnn_mul(ctx, acl_s_new, acl_buf_v, nullptr);
-                // add kv
-                aclnn_add(ctx, acl_s_new, acl_buf_k, nullptr);
-                // compute output
-                // permute state and store in acl_buf k
-                int64_t newdim[2] = {1, 0};
-                aclnn_permute(ctx, acl_s_new, acl_buf_k, newdim, 2);
-                GGML_CANN_CALL_ACLNN_OP(ctx, Mv, acl_buf_k, acl_q, acl_o, 1);
-                aclnn_muls(ctx, acl_o, scale, nullptr, true);
-                ggml_cann_release_resources(ctx, acl_q, acl_k, acl_v, acl_o, acl_g, acl_buf_k, acl_buf_v, acl_k_rep, acl_v_rep);
+                
+                // 1. Compute the k*v outer product
+                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_k, acl_k_rep, acl_buf_k); // broadcast k to [D,D]
+                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_v, acl_v_rep, acl_buf_v); // broadcast v to [D,D]
+                aclnn_mul(ctx, acl_buf_k, acl_buf_v, nullptr); // element-wise multiply k*v
+                
+                // 2. Apply the gate and update the state
+                GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_g, acl_k_rep, acl_buf_v); // broadcast g to [D,D]
+                aclnn_mul(ctx, acl_s_new, acl_buf_v, nullptr); // gating: s = s * g
+                aclnn_add(ctx, acl_s_new, acl_buf_k, nullptr); // state update: s = s + k*v
+                
+                // 3. Compute the output
+                aclnn_permute(ctx, acl_s_new, acl_buf_k, newdim, 2); // transpose the state matrix
+                GGML_CANN_CALL_ACLNN_OP(ctx, Mv, acl_buf_k, acl_q, acl_o, 1); // matrix-vector multiply: o = s^T * q
+                aclnn_muls(ctx, acl_o, scale, nullptr, true); // apply the scale factor
+                
+                // Release the temporaries of the current time step
+                ggml_cann_release_resources(ctx, acl_q, acl_k, acl_v, acl_o, acl_g);
             }
+            
+            // Release the state tensors
             ggml_cann_release_resources(ctx, acl_s, acl_s_new);
         }
     }
+    
+    // Release the pre-allocated resources
+    ggml_cann_release_resources(ctx, acl_buf_k, acl_buf_v, acl_k_rep, acl_v_rep);
 }
 
 void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

From 065232942af39e76a6dde63601d3e52404610faf Mon Sep 17 00:00:00 2001
From: ewykric <wangyukai@stu.pku.edu.cn>
Date: Thu, 4 Dec 2025 15:30:12 +0800
Subject: [PATCH 3/3] CANN: mark GATED_LINEAR_ATTN as supported in docs/ops/CANN.csv

---
 docs/ops/CANN.csv | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/ops/CANN.csv b/docs/ops/CANN.csv
index 0ac1078304a..8170c9755a7 100644
--- a/docs/ops/CANN.csv
+++ b/docs/ops/CANN.csv
@@ -3292,10 +3292,10 @@
 "CANN0","RWKV_WKV7","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=1","support","0","no","CANN"
 "CANN0","RWKV_WKV7","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=4","support","0","no","CANN"
 "CANN0","RWKV_WKV7","type=f32,head_count=32,head_size=64,n_seq_tokens=128,n_seqs=4","support","0","no","CANN"
-"CANN0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=1,n_seqs=1","support","0","no","CANN"
-"CANN0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=1","support","0","no","CANN"
-"CANN0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=4","support","0","no","CANN"
-"CANN0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=128,n_seqs=4","support","0","no","CANN"
+"CANN0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=1,n_seqs=1","support","1","yes","CANN"
+"CANN0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=1","support","1","yes","CANN"
+"CANN0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=4","support","1","yes","CANN"
+"CANN0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=128,n_seqs=4","support","1","yes","CANN"
 "CANN0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0","support","1","yes","CANN"
 "CANN0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0","support","1","yes","CANN"
 "CANN0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0","support","1","yes","CANN"