Skip to content

Commit 9961d24

Browse files
authored
CANN: Resolve soft_max precision issue (#15730)
Previously, the slope tensor was always created as fp16 to improve efficiency. While this worked correctly in flash attention (FA), it caused precision issues in soft_max. This change lets each operator choose the slope tensor's data type (fp16 for FA, fp32 for soft_max) to balance accuracy and performance.
1 parent 25f1045 commit 9961d24

File tree

1 file changed

+15
-10
lines changed

1 file changed

+15
-10
lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,21 +1425,25 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
14251425
* @param start Starting exponent offset.
14261426
* @param stop Stopping exponent offset (exclusive).
14271427
* @param step Step size for the exponent increment.
1428+
* @param dtype Data type for slope tensor.
14281429
*/
14291430
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
1430-
float m, int64_t size, float start, float stop, float step){
1431+
float m, int64_t size, float start, float stop, float step, ggml_type dtype){
1432+
aclDataType acl_type = ggml_cann_type_mapping(dtype);
1433+
size_t type_size = ggml_type_size(dtype);
1434+
14311435
int64_t ne[] = {size};
1432-
size_t nb[] = {sizeof(uint16_t)};
1436+
size_t nb[] = {type_size};
14331437

1434-
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(uint16_t));
1438+
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
14351439
void* arange_buffer = arange_allocator.get();
14361440

14371441
aclTensor* arange_tensor = ggml_cann_create_tensor(
1438-
arange_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
1442+
arange_buffer, acl_type, type_size, ne, nb, 1);
14391443
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
14401444

14411445
aclTensor* slope_tensor = ggml_cann_create_tensor(
1442-
slope_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
1446+
slope_buffer, acl_type, type_size, ne, nb, 1);
14431447

14441448
aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
14451449

@@ -1470,10 +1474,11 @@ static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_bu
14701474
* @param n_head Total number of attention heads.
14711475
* @param slope_buffer Pointer to the output buffer for storing slopes; its element type is given by dtype.
14721476
* @param max_bias Maximum bias value for slope computation.
1477+
* @param dtype Data type for slope tensor.
14731478
*
14741479
*/
14751480
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
1476-
void* slope_buffer, float max_bias) {
1481+
void* slope_buffer, float max_bias, ggml_type dtype) {
14771482
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
14781483

14791484
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
@@ -1490,7 +1495,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
14901495
float step = 1;
14911496
float count = n_head_log2;
14921497
// end needs to be +1 because aclnn uses a left-closed, right-open interval.
1493-
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
1498+
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
14941499
if (n_head_log2 < n_head) {
14951500
// arange2
14961501
start = 2 * (n_head_log2 - n_head_log2) + 1;
@@ -1499,7 +1504,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
14991504
count = n_head - n_head_log2;
15001505
aclnn_get_slope_inner(
15011506
ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
1502-
m1, count, start, end + 1, step);
1507+
m1, count, start, end + 1, step, dtype);
15031508
}
15041509
}
15051510

@@ -1536,7 +1541,7 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
15361541
ggml_cann_pool_alloc bias_allocator(
15371542
ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
15381543
bias_buffer = bias_allocator.get();
1539-
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
1544+
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
15401545
}
15411546

15421547
// broadcast for mask, slope and dst;
@@ -3269,7 +3274,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
32693274
const int64_t n_heads = src0->ne[2];
32703275
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
32713276
void* slope_buffer = slope_allocator.get();
3272-
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
3277+
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
32733278

32743279
int64_t slope_ne[] = {1, 1, n_heads, 1};
32753280
size_t slope_nb[GGML_MAX_DIMS];

0 commit comments

Comments
 (0)