From e4b0eacbc55bd3d1b91fd4c153b8a595ce79c6ee Mon Sep 17 00:00:00 2001
From: zsq0216 <1120270284@qq.com>
Date: Thu, 4 Dec 2025 10:32:03 +0800
Subject: [PATCH] CANN: add FP32 accum in ReduceSum and fix RMSNorm gamma dtype

---
 ggml/src/ggml-cann/aclnn_ops.cpp | 108 ++++++++++++++++++++++---------
 1 file changed, 78 insertions(+), 30 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index bc33b99d96e..bde1f66378c 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -516,17 +516,69 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  * @param dim An array of dimension indices.
  * @param dim_size The number of dimensions.
  */
-static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
+static void aclnn_reduce_sum(ggml_backend_cann_context& ctx,
+                             ggml_tensor* dst,
                              int64_t* dim, size_t dim_size) {
     GGML_ASSERT(dst->ne[0] == 1);
     ggml_tensor* src = dst->src[0];
     aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
     aclIntArray* reduce_dims = aclCreateIntArray(dim, dim_size);
 
-    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true,
-                            ggml_cann_type_mapping(dst->type), acl_dst);
-    ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
+    bool use_fp32_accum = (dst->type != GGML_TYPE_F32);
+
+    aclTensor* acl_dst = nullptr;
+    aclTensor* acl_tmp = nullptr;
+
+    if (!use_fp32_accum) {
+        // write result directly into dst (original path)
+        acl_dst = ggml_cann_create_tensor(dst);
+        GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum,
+                                acl_src, reduce_dims, true,
+                                ggml_cann_type_mapping(dst->type), acl_dst);
+        ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
+    } else {
+        // accumulate in FP32 first, then cast to dst type
+        size_t nelems = ggml_nelements(dst);
+        size_t tmp_bytes = nelems * sizeof(float);
+
+        ggml_cann_pool_alloc tmp_buf(ctx.pool(), tmp_bytes);
+        void* tmp_data = tmp_buf.get();
+
+        // build temporary FP32 tensor
+        int64_t tmp_ne[GGML_MAX_DIMS];
+        size_t tmp_nb[GGML_MAX_DIMS];
+
+        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+            tmp_ne[i] = dst->ne[i];
+        }
+        tmp_nb[0] = sizeof(float);
+        for (int i = 1; i < GGML_MAX_DIMS; ++i) {
+            tmp_nb[i] = tmp_nb[i - 1] * tmp_ne[i - 1];
+        }
+
+        acl_tmp = ggml_cann_create_tensor(tmp_data,
+                                          ACL_FLOAT,
+                                          sizeof(float),
+                                          tmp_ne, tmp_nb,
+                                          GGML_MAX_DIMS,
+                                          ACL_FORMAT_ND);
+
+        acl_dst = ggml_cann_create_tensor(dst);
+
+        // ReduceSum → FP32 temp
+        GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum,
+                                acl_src, reduce_dims, true,
+                                aclDataType::ACL_FLOAT,
+                                acl_tmp);
+
+        // cast FP32 → dst dtype
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cast,
+                                acl_tmp,
+                                ggml_cann_type_mapping(dst->type),
+                                acl_dst);
+
+        ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_tmp, reduce_dims);
+    }
 }
 
 void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -956,38 +1008,34 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    // build gamma, one...
-    size_t acl_gamma_nb[GGML_MAX_DIMS];
-    acl_gamma_nb[0] = sizeof(float);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
-    }
-    aclTensor* acl_gamma = get_f32_cache_acl_tensor(
-        ctx,
-        &ctx.f32_one_cache,
-        ctx.f32_one_cache_element,
-        src->ne,
-        acl_gamma_nb,
-        1, // dims
-        1.0f // value
-    );
-
-    // build rstd, zero...
+    // gamma: same dtype as dst, filled with 1.0
+    const size_t gamma_elem_size = ggml_type_size(dst->type);
+    const aclDataType gamma_acl_dtype = ggml_cann_type_mapping(dst->type);
+
+    int64_t gamma_ne[1] = { src->ne[0] };
+    size_t gamma_nb[1] = { gamma_elem_size };
+
+    ggml_cann_pool_alloc gamma_allocator(ctx.pool(), gamma_ne[0] * gamma_elem_size);
+    void* gamma_buffer = gamma_allocator.get();
+
+    aclTensor* acl_gamma = ggml_cann_create_tensor(
+        gamma_buffer, gamma_acl_dtype, gamma_elem_size,
+        gamma_ne, gamma_nb, 1);
+
+    aclnn_fill_scalar(ctx, 1.0f, acl_gamma);
+
+    // rstd: keep FP32 as in original implementation
 
     size_t acl_rstd_nb[GGML_MAX_DIMS];
     acl_rstd_nb[0] = sizeof(float);
     for (int i = 1; i < GGML_MAX_DIMS; i++) {
         acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1];
     }
+
     aclTensor* acl_rstd = get_f32_cache_acl_tensor(
-        ctx,
-        &ctx.f32_zero_cache,
-        ctx.f32_zero_cache_element,
-        src->ne,
-        acl_rstd_nb,
-        GGML_MAX_DIMS,
-        0.0f // value
-    );
+        ctx, &ctx.f32_zero_cache, ctx.f32_zero_cache_element,
+        src->ne, acl_rstd_nb, GGML_MAX_DIMS, 0.0f);
+
+    // RMSNorm
     GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
     ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
 }