Skip to content

Commit 3f12537 (parent: e00f3fd)

CANN: add high performance mode using FP16 for intermediate states

Introduce a high performance mode for the CANN backend. In this mode, intermediate computation states are stored in FP16, which improves execution performance at the cost of slightly reduced precision.

File tree: 6 files changed, +150 −72 lines

docs/backend/CANN.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,3 +322,7 @@ Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. Whe
322322
### GGML_CANN_PREFILL_USE_GRAPH
323323

324324
Enable ACL graph execution during the prefill stage, default is false. This option is only effective when FA is enabled.
325+
326+
### GGML_CANN_HIGH_PERF_MODE
327+
328+
Enable high performance mode. Intermediate computation states are stored in FP16, which improves speed but may slightly reduce precision.

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 90 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1765,35 +1765,31 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
17651765
ggml_tensor* src0 = dst->src[0]; // src
17661766
ggml_tensor* src1 = dst->src[1]; // index
17671767

1768-
switch (src0->type) {
1769-
case GGML_TYPE_F32: {
1768+
if(src0->type == dst->type) {
1769+
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
17701770
aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
17711771
dst->data, dst->ne, dst->nb,
17721772
src1, dst->type);
1773-
break;
1774-
}
1775-
case GGML_TYPE_F16: {
1773+
} else if(src0->type == GGML_TYPE_F16) {
17761774
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
17771775
ggml_cann_pool_alloc src_buffer_allocator(
1778-
ctx.pool(), ggml_nelements(src0) * sizeof(float));
1776+
ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
17791777
void* src_trans_buffer = src_buffer_allocator.get();
17801778
size_t src_trans_nb[GGML_MAX_DIMS];
1781-
src_trans_nb[0] = sizeof(float);
1779+
src_trans_nb[0] = dst->nb[0];
17821780
for (int i = 1; i < GGML_MAX_DIMS; i++) {
17831781
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
17841782
}
17851783
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
1786-
src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
1784+
src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
17871785
src0->ne, src_trans_nb, GGML_MAX_DIMS);
17881786
aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
17891787
aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
17901788
dst->data, dst->ne, dst->nb,
17911789
src1, dst->type);
17921790
ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
1793-
break;
1794-
}
1795-
case GGML_TYPE_Q8_0: {
1796-
// add 1 dim for bcast mul.
1791+
} else if (src0->type == GGML_TYPE_Q8_0){
1792+
// add 1 dim for bcast mul.
17971793
size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
17981794
dequant_nb[GGML_MAX_DIMS + 1];
17991795
int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
@@ -1854,11 +1850,8 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
18541850
src1, dst->type);
18551851

18561852
ggml_cann_release_resources(ctx, dequant_tensor);
1857-
break;
1858-
}
1859-
default:
1853+
} else {
18601854
GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
1861-
break;
18621855
}
18631856
}
18641857

@@ -3178,7 +3171,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
31783171
aclTensor* acl_src0_f16_tensor = nullptr;
31793172
aclTensor* acl_src1_f16_tensor = nullptr;
31803173
aclTensor* acl_src2_f16_tensor = nullptr;
3181-
aclTensor* acl_dst_f16_tensor = nullptr;
31823174

31833175
// Step 1: cast the src0 (Query) to fp16 if needed
31843176
ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
@@ -3216,22 +3208,6 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
32163208
acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
32173209
src2_bsnd_nb, GGML_MAX_DIMS);
32183210

3219-
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
3220-
void* out_f16_buffer = out_f16_allocator.alloc(
3221-
ggml_nelements(dst) * faElemSize);
3222-
3223-
int64_t* out_f16_ne = src0_bsnd_ne;
3224-
size_t out_f16_nb[GGML_MAX_DIMS];
3225-
out_f16_nb[0] = faElemSize;
3226-
for(int i = 1; i < GGML_MAX_DIMS; ++i){
3227-
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
3228-
}
3229-
3230-
acl_dst_f16_tensor = ggml_cann_create_tensor(
3231-
out_f16_buffer, faDataType, faElemSize,
3232-
out_f16_ne, out_f16_nb, GGML_MAX_DIMS
3233-
);
3234-
32353211
// Step 3: create the PSEShift tensor if needed
32363212
// this tensor is considered as mask (f16) in the llama.cpp
32373213
aclTensor* bcast_pse_tensor = nullptr;
@@ -3336,40 +3312,88 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
33363312

33373313
// Step 5: launch the FusedInferAttentionScoreV2 kernel.
33383314
// Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md
3315+
if (dst->type == GGML_TYPE_F16) {
3316+
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
3317+
3318+
GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
3319+
acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
3320+
bcast_pse_tensor, nullptr, // pse, mask
3321+
nullptr, nullptr, // actSeqLen, actSeqLenkv
3322+
nullptr, nullptr, // deqScale1, quantScale1
3323+
nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
3324+
nullptr, nullptr, // antiquantScale, antiquantOffset
3325+
nullptr, // blockTable
3326+
nullptr, nullptr, // qPadSize, kvPadSize
3327+
nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
3328+
nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
3329+
nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
3330+
numHeads, scaleValue, // heads, scaleValue
3331+
preTokens, nextTokens, // preTokens, nextTokens
3332+
layout, // inputLayout
3333+
numKeyValueHeads, // numKVHeads
3334+
sparseMode, innerPrecise, // sparseMode, innerPrecise
3335+
blockSize, antiquantMode, // blockSize, antiquantMode
3336+
softmaxLseFlag, // softmaxLseFlag
3337+
keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
3338+
acl_dst_tensor, // attentionOut
3339+
nullptr // softmaxLse
3340+
);
3341+
3342+
ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
3343+
acl_src1_f16_tensor,
3344+
acl_src2_f16_tensor,
3345+
acl_dst_tensor);
3346+
} else {
3347+
aclTensor* acl_dst_f16_tensor = nullptr;
3348+
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
3349+
void* out_f16_buffer = out_f16_allocator.alloc(
3350+
ggml_nelements(dst) * faElemSize);
3351+
3352+
int64_t* out_f16_ne = src0_bsnd_ne;
3353+
size_t out_f16_nb[GGML_MAX_DIMS];
3354+
out_f16_nb[0] = faElemSize;
3355+
for(int i = 1; i < GGML_MAX_DIMS; ++i){
3356+
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
3357+
}
3358+
3359+
acl_dst_f16_tensor = ggml_cann_create_tensor(
3360+
out_f16_buffer, faDataType, faElemSize,
3361+
out_f16_ne, out_f16_nb, GGML_MAX_DIMS
3362+
);
3363+
GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
3364+
acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
3365+
bcast_pse_tensor, nullptr, // pse, mask
3366+
nullptr, nullptr, // actSeqLen, actSeqLenkv
3367+
nullptr, nullptr, // deqScale1, quantScale1
3368+
nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
3369+
nullptr, nullptr, // antiquantScale, antiquantOffset
3370+
nullptr, // blockTable
3371+
nullptr, nullptr, // qPadSize, kvPadSize
3372+
nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
3373+
nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
3374+
nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
3375+
numHeads, scaleValue, // heads, scaleValue
3376+
preTokens, nextTokens, // preTokens, nextTokens
3377+
layout, // inputLayout
3378+
numKeyValueHeads, // numKVHeads
3379+
sparseMode, innerPrecise, // sparseMode, innerPrecise
3380+
blockSize, antiquantMode, // blockSize, antiquantMode
3381+
softmaxLseFlag, // softmaxLseFlag
3382+
keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
3383+
acl_dst_f16_tensor, // attentionOut
3384+
nullptr // softmaxLse
3385+
);
3386+
// Step 6: post-processing, permute and cast to f32
3387+
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
3388+
// TODO: when dst is fp16, don't need cast
3389+
aclnn_cast(ctx, acl_dst_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
3390+
ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
3391+
acl_src1_f16_tensor,
3392+
acl_src2_f16_tensor,
3393+
acl_dst_f16_tensor,
3394+
acl_dst_tensor);
3395+
}
33393396

3340-
GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
3341-
acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
3342-
bcast_pse_tensor, nullptr, // pse, mask
3343-
nullptr, nullptr, // actSeqLen, actSeqLenkv
3344-
nullptr, nullptr, // deqScale1, quantScale1
3345-
nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
3346-
nullptr, nullptr, // antiquantScale, antiquantOffset
3347-
nullptr, // blockTable
3348-
nullptr, nullptr, // qPadSize, kvPadSize
3349-
nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
3350-
nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
3351-
nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
3352-
numHeads, scaleValue, // heads, scaleValue
3353-
preTokens, nextTokens, // preTokens, nextTokens
3354-
layout, // inputLayout
3355-
numKeyValueHeads, // numKVHeads
3356-
sparseMode, innerPrecise, // sparseMode, innerPrecise
3357-
blockSize, antiquantMode, // blockSize, antiquantMode
3358-
softmaxLseFlag, // softmaxLseFlag
3359-
keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
3360-
acl_dst_f16_tensor, // attentionOut
3361-
nullptr // softmaxLse
3362-
);
3363-
3364-
// Step 6: post-processing, permute and cast to f32
3365-
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
3366-
// TODO: when dst is fp16, don't need cast
3367-
aclnn_cast(ctx, acl_dst_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
3368-
ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
3369-
acl_src1_f16_tensor,
3370-
acl_src2_f16_tensor,
3371-
acl_dst_f16_tensor,
3372-
acl_dst_tensor);
33733397
if(src3 != nullptr){
33743398
ggml_cann_release_resources(ctx, bcast_pse_tensor);
33753399
}

ggml/src/ggml-cpu/ops.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5357,9 +5357,14 @@ static void ggml_compute_forward_get_rows_f16(
53575357

53585358
GGML_ASSERT(i01 >= 0 && i01 < ne01);
53595359

5360-
ggml_cpu_fp16_to_fp32(
5361-
(const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
5362-
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
5360+
if (dst->type == GGML_TYPE_F16)
5361+
ggml_vec_cpy_f16(nc,
5362+
(ggml_fp16_t *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
5363+
(ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
5364+
else
5365+
ggml_cpu_fp16_to_fp32(
5366+
(const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
5367+
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
53635368
}
53645369
}
53655370

ggml/src/ggml-cpu/vec.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp
8888
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
8989
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
9090
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
91+
inline static void ggml_vec_cpy_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
9192
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
9293
for (int i = 0; i < n; ++i) {
9394
y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));

ggml/src/ggml.c

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <float.h>
3333
#include <limits.h>
3434
#include <stdarg.h>
35+
#include <ctype.h>
3536
#include <signal.h>
3637
#if defined(__gnu_linux__)
3738
#include <syscall.h>
@@ -3006,6 +3007,32 @@ struct ggml_tensor * ggml_l2_norm_inplace(
30063007
return ggml_l2_norm_impl(ctx, a, eps, true);
30073008
}
30083009

3010+
static int get_env_as_bool(const char *name) {
3011+
const char *val = getenv(name);
3012+
if (val == NULL) {
3013+
return 0;
3014+
}
3015+
3016+
char buf[64];
3017+
size_t len = strlen(val);
3018+
if (len >= sizeof(buf)) {
3019+
len = sizeof(buf) - 1;
3020+
}
3021+
for (size_t i = 0; i < len; i++) {
3022+
buf[i] = (char)tolower((unsigned char)val[i]);
3023+
}
3024+
buf[len] = '\0';
3025+
3026+
const char *truthy[] = {"on", "1", "yes", "y", "enable", "true"};
3027+
for (size_t i = 0; i < sizeof(truthy) / sizeof(truthy[0]); i++) {
3028+
if (strcmp(buf, truthy[i]) == 0) {
3029+
return 1; // true
3030+
}
3031+
}
3032+
3033+
return 0; // false
3034+
}
3035+
30093036
// ggml_mul_mat
30103037

30113038
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
@@ -3024,7 +3051,12 @@ struct ggml_tensor * ggml_mul_mat(
30243051
GGML_ASSERT(!ggml_is_transposed(a));
30253052

30263053
const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
3027-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3054+
struct ggml_tensor * result;
3055+
if(get_env_as_bool("GGML_CANN_HIGH_PERF_MODE") && b->type == GGML_TYPE_F16){
3056+
result = ggml_new_tensor(ctx, b->type, 4, ne);
3057+
} else {
3058+
result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
3059+
}
30283060

30293061
result->op = GGML_OP_MUL_MAT;
30303062
result->src[0] = a;
@@ -3629,6 +3661,9 @@ struct ggml_tensor * ggml_get_rows(
36293661

36303662
// TODO: implement non F32 return
36313663
enum ggml_type type = GGML_TYPE_F32;
3664+
if(get_env_as_bool("GGML_CANN_HIGH_PERF_MODE") && a->type == GGML_TYPE_F16){
3665+
type = a->type;
3666+
}
36323667
if (a->type == GGML_TYPE_I32) {
36333668
type = a->type;
36343669
}
@@ -3676,7 +3711,7 @@ struct ggml_tensor * ggml_set_rows(
36763711
GGML_ASSERT(b->ne[2] % c->ne[1] == 0);
36773712
GGML_ASSERT(b->ne[3] % c->ne[2] == 0);
36783713
GGML_ASSERT(c->ne[3] == 1);
3679-
GGML_ASSERT(b->type == GGML_TYPE_F32);
3714+
// GGML_ASSERT(b->type == GGML_TYPE_F32);
36803715
GGML_ASSERT(c->type == GGML_TYPE_I64);
36813716

36823717
GGML_ASSERT(ggml_is_contiguous_rows(a));
@@ -5003,7 +5038,13 @@ struct ggml_tensor * ggml_flash_attn_ext(
50035038

50045039
// permute(0, 2, 1, 3)
50055040
int64_t ne[4] = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] };
5006-
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5041+
5042+
struct ggml_tensor * result;
5043+
if(get_env_as_bool("GGML_CANN_HIGH_PERF_MODE") && q->type == GGML_TYPE_F16){
5044+
result = ggml_new_tensor(ctx, q->type, 4, ne);
5045+
} else {
5046+
result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
5047+
}
50075048

50085049
float params[] = { scale, max_bias, logit_softcap };
50095050
ggml_set_op_params(result, params, sizeof(params));

src/llama-model.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8530,6 +8530,9 @@ struct llm_build_qwen2 : public llm_graph_context {
85308530

85318531
// lm_head
85328532
cur = build_lora_mm(model.output, cur);
8533+
if (cur->type != GGML_TYPE_F32) {
8534+
cur = ggml_cast(ctx0 ,cur, GGML_TYPE_F32);
8535+
}
85338536

85348537
if (model.output_b != nullptr) {
85358538
cur = ggml_add(ctx0, cur, model.output_b);

0 commit comments

Comments
 (0)