diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index bc33b99d96e..074fb303153 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -423,6 +423,98 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_cann_release_resources(ctx, acl_src, tmp_tensor, acl_dst);
 }
 
+void ggml_cann_l2_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    GGML_ASSERT(src && dst);
+    GGML_ASSERT(src->type == GGML_TYPE_F32); // input f32
+    GGML_ASSERT(dst->type == GGML_TYPE_F32); // output f32
+
+    // Step 0: read eps from op_params (stored as a single f32)
+    float eps = 1e-6f;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    // Step 1: create acl tensors for src/dst
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    // Step 2: allocate temporary buffer: need 4 * src_bytes
+    size_t type_size = ggml_type_size(src->type);
+    size_t src_bytes = ggml_nbytes(src);
+    ggml_cann_pool_alloc tmp_alloc(ctx.pool(), src_bytes * 4);
+    void* buf = tmp_alloc.get();
+
+    // Step 3: sq = x * x (same shape as src)
+    int64_t ne_tmp[GGML_MAX_DIMS];
+    size_t nb_tmp[GGML_MAX_DIMS];
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        ne_tmp[i] = src->ne[i];
+        nb_tmp[i] = src->nb[i];
+    }
+    aclTensor* acl_sq = ggml_cann_create_tensor(
+        buf, ACL_FLOAT, type_size, ne_tmp, nb_tmp, GGML_MAX_DIMS, ACL_FORMAT_ND);
+    GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_src, acl_sq);
+
+    // Step 4: reduce over ggml axis 0 (innermost); ACL dims are reversed, so this is the last ACL axis of the 4-D temporaries
+    const int ggml_reduce_axis = 0;
+    const int64_t acl_reduce_axis = GGML_MAX_DIMS - 1 - ggml_reduce_axis;
+    int64_t reduce_dims[] = { acl_reduce_axis };
+    aclIntArray* reduce_axis = aclCreateIntArray(reduce_dims, 1);
+
+    // Build ne_sum: set the reduced ggml axis to 1
+    int64_t ne_sum[GGML_MAX_DIMS];
+    for (int i = 0; i < GGML_MAX_DIMS; i++) ne_sum[i] = src->ne[i];
+    ne_sum[ggml_reduce_axis] = 1;
+
+    // Contiguous strides for ne_sum
+    size_t nb_sum[GGML_MAX_DIMS];
+    nb_sum[0] = ggml_type_size(src->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) nb_sum[i] = nb_sum[i - 1] * ne_sum[i - 1];
+
+    // Create acl_sum with the ne_sum layout (explicit ACL_FLOAT)
+    aclTensor* acl_sum = ggml_cann_create_tensor(
+        (char*)buf + src_bytes, ACL_FLOAT, type_size, ne_sum, nb_sum, GGML_MAX_DIMS, ACL_FORMAT_ND);
+
+    // Request ACL_FLOAT as the ReduceSum output dtype so accumulation and output stay in fp32
+    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_sq, reduce_axis, true,
+                            ACL_FLOAT, acl_sum);
+
+    // Step 5: add eps, then take the square root (all in fp32)
+    aclScalar* eps_scalar = aclCreateScalar(&eps, ACL_FLOAT);
+    // aclnnAdds computes self + alpha * other, so alpha = 1.0f
+    float alpha = 1.0f;
+    aclScalar* alpha_scalar = aclCreateScalar(&alpha, ACL_FLOAT);
+
+    // Add eps: acl_sum = acl_sum + eps
+    GGML_CANN_CALL_ACLNN_OP(ctx, Adds, acl_sum, eps_scalar, alpha_scalar, acl_sum);
+
+    // sqrt: norm = sqrt(sum + eps)
+    aclTensor* acl_sqrt = ggml_cann_create_tensor(
+        (char*)buf + src_bytes * 2, ACL_FLOAT, type_size, ne_sum, nb_sum, GGML_MAX_DIMS, ACL_FORMAT_ND);
+    GGML_CANN_CALL_ACLNN_OP(ctx, Sqrt, acl_sum, acl_sqrt);
+
+    // Step 6: repeat the norm back to the src shape (repeat counts in reversed ACL order)
+    int64_t repeats[GGML_MAX_DIMS];
+    repeats[0] = (ne_sum[3] == 0) ? 1 : (src->ne[3] / ne_sum[3]);
+    repeats[1] = (ne_sum[2] == 0) ? 1 : (src->ne[2] / ne_sum[2]);
+    repeats[2] = (ne_sum[1] == 0) ? 1 : (src->ne[1] / ne_sum[1]);
+    repeats[3] = (ne_sum[0] == 0) ? 1 : (src->ne[0] / ne_sum[0]);
+
+    // allocate the expanded target tensor (same shape as src)
+    aclTensor* acl_sqrt_rep = ggml_cann_create_tensor(
+        (char*)buf + src_bytes * 3, ACL_FLOAT, type_size, src->ne, src->nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+
+    // perform the repeat: small -> expanded
+    aclnn_repeat(ctx, acl_sqrt, acl_sqrt_rep, repeats);
+
+    // Step 7: divide x by the expanded norm
+    GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_sqrt_rep, acl_dst);
+
+    // Step 8: release resources
+    ggml_cann_release_resources(ctx,
+        reduce_axis, eps_scalar, alpha_scalar,
+        acl_src, acl_sq, acl_sum, acl_sqrt, acl_sqrt_rep, acl_dst);
+}
+
 void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index 5c510cc9932..1f5cb33341e 100755
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -185,6 +185,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 * @param dst The destination tensor where the normalized values will be stored.
 * @attention `Var` defaults to dst->ne[0].
 */
 void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+void ggml_cann_l2_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cb8af42ebf9..bed1aac29e4 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1771,6 +1771,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                 return false;
             }
             break;
+
+        case GGML_OP_L2_NORM:
+            ggml_cann_l2_norm(ctx, dst);
+            break;
         case GGML_OP_NORM:
             ggml_cann_norm(ctx, dst);
             break;
@@ -2466,6 +2470,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+        case GGML_OP_L2_NORM:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
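
For reference, the kernel added above normalizes each row along `ne[0]` as `y = x / sqrt(sum(x^2) + eps)`. The sketch below is a minimal host-side C++ reference of that same math, not part of the patch; the function name, flat row-major layout, and parameters are illustrative assumptions, and it is only meant for eyeballing expected outputs against the device result.

```cpp
#include <cmath>
#include <cstddef>

// Hypothetical reference for the math implemented by ggml_cann_l2_norm above:
// y = x / sqrt(sum(x^2) + eps), computed independently for each contiguous row of length ne0.
static void l2_norm_ref(const float * x, float * y, size_t ne0, size_t nrows, float eps) {
    for (size_t r = 0; r < nrows; ++r) {
        const float * xr = x + r * ne0;
        float *       yr = y + r * ne0;

        float sum = 0.0f;
        for (size_t i = 0; i < ne0; ++i) {
            sum += xr[i] * xr[i];              // Steps 3-4: square and reduce over the innermost dim
        }
        const float denom = sqrtf(sum + eps);  // Step 5: add eps, then sqrt
        for (size_t i = 0; i < ne0; ++i) {
            yr[i] = xr[i] / denom;             // Steps 6-7: broadcast the norm and divide
        }
    }
}
```

As with other CANN ops, the usual way to validate the kernel against the CPU backend is `test-backend-ops` (e.g. filtering with `-o L2_NORM`), which compares the CANN output to the CPU reference within tolerance.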