From 0ff18e0cda37e2a98b0e95b5ef9bb4f8fff59e6b Mon Sep 17 00:00:00 2001 From: YangShuai52 Date: Thu, 14 Aug 2025 16:51:58 +0800 Subject: [PATCH 1/4] optimize rope ops --- ggml/src/ggml-cann/aclnn_ops.cpp | 170 +++++++++++++++++++------------ ggml/src/ggml-cann/common.h | 13 +++ 2 files changed, 118 insertions(+), 65 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 259a2928b1f36..718a41d9e013a 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2154,87 +2154,128 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, GGML_TENSOR_BINARY_OP_LOCALS - // theta_scale arange, [0,1,...,ne00/2 - 1] int64_t theta_scale_length = ne00 / 2; - ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), - theta_scale_length * sizeof(float_t)); - void* theta_scale_buffer = theta_scale_allocator.get(); int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1}; size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t), theta_scale_length * sizeof(float_t)}; - aclTensor* acl_theta_scale_tensor = - ggml_cann_create_tensor(theta_scale_buffer, ACL_FLOAT, sizeof(float_t), - theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); - float start = 0; - float step = 1; - float stop = ne00 / 2; - float n_elements = ne00 / 2; - aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements); - - // power - aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, - acl_theta_scale_tensor); - - // freq_scale - if (freq_scale != 1) { - aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); - } - - // freq_factors - if (src2) { - aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( - src2->data, ggml_cann_type_mapping(src2->type), - ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); - aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor); - ggml_cann_release_resources(ctx, acl_freq_factors_tensor); - } - - // position GGML_ASSERT(src1->type == GGML_TYPE_I32); int64_t position_length = src1->ne[0]; int64_t position_ne[] = {1, 1, position_length, 1}; size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length}; - aclTensor* acl_position_tensor = ggml_cann_create_tensor( - src1->data, ggml_cann_type_mapping(src1->type), - ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); - - // power * position - int64_t theta_length = theta_scale_length * position_length; - ggml_cann_pool_alloc theta_allocator(ctx.pool(), - theta_length * sizeof(float_t)); - void* theta_buffer = theta_allocator.get(); + int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1}; size_t theta_nb[GGML_MAX_DIMS]; theta_nb[0] = sizeof(float_t); for (int i = 1; i < GGML_MAX_DIMS; i++) { theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; } - aclTensor* acl_theta_tensor = - ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), - theta_ne, theta_nb, GGML_MAX_DIMS); - aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, - acl_theta_tensor); - - // sin/cos - ggml_cann_pool_alloc sin_allocator(ctx.pool(), - theta_length * sizeof(float_t)); - void* sin_buffer = sin_allocator.get(); - aclTensor* acl_sin_tensor = ggml_cann_create_tensor( - sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, - GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor); - ggml_cann_pool_alloc cos_allocator(ctx.pool(), - theta_length * sizeof(float_t)); - void* cos_buffer = cos_allocator.get(); - aclTensor* acl_cos_tensor = ggml_cann_create_tensor( - cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, - GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor); + bool is_q = (std::strncmp(dst->name, "Qcur-", 5) == 0); + bool is_k = (std::strncmp(dst->name, "Kcur-", 5) == 0); + bool is_attention = is_q || is_k; + if(ctx.init_ptr == nullptr || !is_attention) { + // theta_scale arange, [0,1,...,ne00/2 - 1] + if(ctx.init_ptr != nullptr){ + ACL_CHECK(aclrtFree(ctx.init_ptr)); + } + ACL_CHECK(aclrtMalloc(&ctx.init_ptr,theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + + aclTensor* acl_theta_scale_tensor = + ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t), + theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + float start = 0; + float step = 1; + float stop = ne00 / 2; + float n_elements = ne00 / 2; + aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements); + + // power + aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, + acl_theta_scale_tensor); + + // freq_scale + if (freq_scale != 1) { + aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); + } + + // freq_factors + if (src2) { + aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( + src2->data, ggml_cann_type_mapping(src2->type), + ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor); + ggml_cann_release_resources(ctx, acl_freq_factors_tensor); + } + // release + ggml_cann_release_resources(ctx, acl_theta_scale_tensor,acl_theta_scale); + } + + if(ctx.sin_ptr == nullptr) { + int64_t theta_length = theta_scale_length * ctx.max_position_length; + ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + } + if(position_length > ctx.max_position_length) { + ctx.max_position_length = position_length; + int64_t theta_length = theta_scale_length * ctx.max_position_length; + ACL_CHECK(aclrtFree(ctx.sin_ptr)); + ACL_CHECK(aclrtFree(ctx.cos_ptr)); + ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + } + + bool is_fisrt_layer = (std::strncmp(dst->name, "Qcur-0", GGML_MAX_NAME) == 0); + + if(is_fisrt_layer || !is_attention) { + + aclTensor* acl_theta_scale_tensor = + ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t), + theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + + // position + aclTensor* acl_position_tensor = ggml_cann_create_tensor( + src1->data, ggml_cann_type_mapping(src1->type), + ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); + + // power * position + int64_t theta_length = theta_scale_length * position_length; + ggml_cann_pool_alloc theta_allocator(ctx.pool(), + theta_length * sizeof(float_t)); + void* theta_buffer = theta_allocator.get(); + + aclTensor* acl_theta_tensor = + ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), + theta_ne, theta_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, + acl_theta_tensor); + + // sin/cos + aclTensor* acl_sin_tensor = ggml_cann_create_tensor( + ctx.sin_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor); + + aclTensor* acl_cos_tensor = ggml_cann_create_tensor( + ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor); + + // release + ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, + acl_theta_tensor, acl_sin_tensor, acl_cos_tensor); + } + + aclTensor* acl_sin_tensor = ggml_cann_create_tensor( + ctx.sin_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* acl_cos_tensor = ggml_cann_create_tensor( + ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + // attn_factor if (attn_factor != 1) { aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true); @@ -2257,8 +2298,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, } // release - ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, - acl_theta_tensor, acl_sin_tensor, acl_cos_tensor, acl_theta_scale); + ggml_cann_release_resources(ctx, acl_sin_tensor, acl_cos_tensor); } #ifdef __cplusplus diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 9d294f72b6779..9a00f55f8d5ad 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -368,6 +368,10 @@ struct ggml_backend_cann_context { std::string name; /**< Name of the device. */ std::string description; /**< Description of the device. */ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ + void* init_ptr = nullptr; + void* sin_ptr = nullptr; + void* cos_ptr = nullptr; + int64_t max_position_length = 200000; #ifdef USE_ACL_GRAPH /// Cached CANN ACL graph used for executing the current ggml computation graph. std::unique_ptr cann_graph; @@ -414,6 +418,15 @@ struct ggml_backend_cann_context { ACL_CHECK(aclrtDestroyStream(streams[i])); } } + if(init_ptr != nullptr) { + ACL_CHECK(aclrtFree(init_ptr)); + } + if(sin_ptr != nullptr) { + ACL_CHECK(aclrtFree(sin_ptr)); + } + if(cos_ptr != nullptr) { + ACL_CHECK(aclrtFree(cos_ptr)); + } } /** From 930ee57153377a1f4cf96fa8d0579e41fc956699 Mon Sep 17 00:00:00 2001 From: YangShuai52 Date: Fri, 15 Aug 2025 11:09:17 +0800 Subject: [PATCH 2/4] amendment --- ggml/src/ggml-cann/aclnn_ops.cpp | 4 +++- ggml/src/ggml-cann/common.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 718a41d9e013a..c4811c7715be7 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2174,6 +2174,8 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, bool is_q = (std::strncmp(dst->name, "Qcur-", 5) == 0); bool is_k = (std::strncmp(dst->name, "Kcur-", 5) == 0); + + // used for accuracy testing bool is_attention = is_q || is_k; if(ctx.init_ptr == nullptr || !is_attention) { @@ -2181,7 +2183,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, if(ctx.init_ptr != nullptr){ ACL_CHECK(aclrtFree(ctx.init_ptr)); } - ACL_CHECK(aclrtMalloc(&ctx.init_ptr,theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&ctx.init_ptr, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); aclTensor* acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t), diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 9a00f55f8d5ad..2c2033bfba857 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -371,7 +371,7 @@ struct ggml_backend_cann_context { void* init_ptr = nullptr; void* sin_ptr = nullptr; void* cos_ptr = nullptr; - int64_t max_position_length = 200000; + int64_t max_prompt_length = 65536; #ifdef USE_ACL_GRAPH /// Cached CANN ACL graph used for executing the current ggml computation graph. std::unique_ptr cann_graph; From 682600c0cbcf6e4059359268dedff54f43f0e5a1 Mon Sep 17 00:00:00 2001 From: YangShuai52 Date: Fri, 15 Aug 2025 14:54:26 +0800 Subject: [PATCH 3/4] delete trailing whitespace --- ggml/src/ggml-cann/aclnn_ops.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index c4811c7715be7..7e4903b1f0507 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2184,7 +2184,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ACL_CHECK(aclrtFree(ctx.init_ptr)); } ACL_CHECK(aclrtMalloc(&ctx.init_ptr, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); - + aclTensor* acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.init_ptr, ACL_FLOAT, sizeof(float_t), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); @@ -2198,7 +2198,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, acl_theta_scale_tensor); - + // freq_scale if (freq_scale != 1) { aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); @@ -2231,7 +2231,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, } bool is_fisrt_layer = (std::strncmp(dst->name, "Qcur-0", GGML_MAX_NAME) == 0); - + if(is_fisrt_layer || !is_attention) { aclTensor* acl_theta_scale_tensor = @@ -2242,13 +2242,13 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_position_tensor = ggml_cann_create_tensor( src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); - + // power * position int64_t theta_length = theta_scale_length * position_length; ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float_t)); void* theta_buffer = theta_allocator.get(); - + aclTensor* acl_theta_tensor = ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, GGML_MAX_DIMS); @@ -2277,7 +2277,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_cos_tensor = ggml_cann_create_tensor( ctx.cos_ptr, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - + // attn_factor if (attn_factor != 1) { aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true); From 170d40e7167c9122f0b3092d767d950a2afce7d4 Mon Sep 17 00:00:00 2001 From: YangShuai52 Date: Mon, 18 Aug 2025 14:08:20 +0800 Subject: [PATCH 4/4] change the variable name --- ggml/src/ggml-cann/aclnn_ops.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 7e4903b1f0507..2a5cb8abfa137 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2217,13 +2217,13 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, } if(ctx.sin_ptr == nullptr) { - int64_t theta_length = theta_scale_length * ctx.max_position_length; + int64_t theta_length = theta_scale_length * ctx.max_prompt_length; ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); ACL_CHECK(aclrtMalloc(&ctx.cos_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST)); } - if(position_length > ctx.max_position_length) { - ctx.max_position_length = position_length; - int64_t theta_length = theta_scale_length * ctx.max_position_length; + if(position_length > ctx.max_prompt_length) { + ctx.max_prompt_length = position_length; + int64_t theta_length = theta_scale_length * ctx.max_prompt_length; ACL_CHECK(aclrtFree(ctx.sin_ptr)); ACL_CHECK(aclrtFree(ctx.cos_ptr)); ACL_CHECK(aclrtMalloc(&ctx.sin_ptr, theta_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));