Skip to content

Commit bc05c26

Browse files
authored
Merge branch 'ggml-org:master' into master
2 parents 20fe00e + 9961d24 commit bc05c26

File tree

9 files changed

+191
-52
lines changed

9 files changed

+191
-52
lines changed

docs/backend/CANN.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,3 +314,7 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
314314

315315
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
316316

317+
### GGML_CANN_DISABLE_ACL_GRAPH
318+
319+
When this variable is set, ACL graph execution is disabled and operators are executed in op-by-op (eager) mode.
320+
This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 70 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@
7070
#include <aclnnop/aclnn_zero.h>
7171
#include <aclnnop/aclnn_index_copy.h>
7272
#include <aclnnop/aclnn_index_select.h>
73+
#include <aclnnop/aclnn_clamp.h>
74+
#include <aclnnop/aclnn_threshold.h>
7375
#include <float.h>
7476

7577
#include <cmath>
@@ -1423,21 +1425,25 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
14231425
* @param start Starting exponent offset.
14241426
* @param stop Stopping exponent offset (exclusive).
14251427
* @param step Step size for the exponent increment.
1428+
* @param dtype Data type for slope tensor.
14261429
*/
14271430
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
1428-
float m, int64_t size, float start, float stop, float step){
1431+
float m, int64_t size, float start, float stop, float step, ggml_type dtype){
1432+
aclDataType acl_type = ggml_cann_type_mapping(dtype);
1433+
size_t type_size = ggml_type_size(dtype);
1434+
14291435
int64_t ne[] = {size};
1430-
size_t nb[] = {sizeof(uint16_t)};
1436+
size_t nb[] = {type_size};
14311437

1432-
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(uint16_t));
1438+
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
14331439
void* arange_buffer = arange_allocator.get();
14341440

14351441
aclTensor* arange_tensor = ggml_cann_create_tensor(
1436-
arange_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
1442+
arange_buffer, acl_type, type_size, ne, nb, 1);
14371443
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
14381444

14391445
aclTensor* slope_tensor = ggml_cann_create_tensor(
1440-
slope_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
1446+
slope_buffer, acl_type, type_size, ne, nb, 1);
14411447

14421448
aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
14431449

@@ -1468,10 +1474,11 @@ static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_bu
14681474
* @param n_head Total number of attention heads.
14691475
* @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
14701476
* @param max_bias Maximum bias value for slope computation.
1477+
* @param dtype Data type for slope tensor.
14711478
*
14721479
*/
14731480
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
1474-
void* slope_buffer, float max_bias) {
1481+
void* slope_buffer, float max_bias, ggml_type dtype) {
14751482
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
14761483

14771484
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
@@ -1488,7 +1495,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
14881495
float step = 1;
14891496
float count = n_head_log2;
14901497
// end needs to be +1 because aclnn uses a left-closed, right-open interval.
1491-
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
1498+
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
14921499
if (n_head_log2 < n_head) {
14931500
// arange2
14941501
start = 2 * (n_head_log2 - n_head_log2) + 1;
@@ -1497,7 +1504,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
14971504
count = n_head - n_head_log2;
14981505
aclnn_get_slope_inner(
14991506
ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
1500-
m1, count, start, end + 1, step);
1507+
m1, count, start, end + 1, step, dtype);
15011508
}
15021509
}
15031510

@@ -1534,7 +1541,7 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
15341541
ggml_cann_pool_alloc bias_allocator(
15351542
ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
15361543
bias_buffer = bias_allocator.get();
1537-
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
1544+
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
15381545
}
15391546

15401547
// broadcast for mask, slope and dst;
@@ -2263,6 +2270,7 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
22632270
*/
22642271
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
22652272
void* sin_tensor_buffer, void* cos_tensor_buffer,
2273+
float* corr_dims, float ext_factor,
22662274
float theta_scale, float freq_scale,
22672275
float attn_factor, bool is_neox) {
22682276
// init sin/cos cache; the cache uses a different repeat method depending on
@@ -2318,16 +2326,60 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
23182326
float n_elements = theta_scale_length;
23192327
aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);
23202328

2329+
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
2330+
aclTensor* acl_yarn_ramp_tensor = nullptr;
2331+
if (ext_factor != 0) {
2332+
// -rope_yarn_ramp
2333+
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
2334+
// return MIN(1, MAX(0, y)) - 1;
2335+
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
2336+
void* yarn_ramp_buffer = yarn_ramp_allocator.get();
2337+
acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float_t),
2338+
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2339+
float zero_value = 0, one_value = 1;
2340+
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
2341+
aclScalar* low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);
2342+
aclScalar* zero = aclCreateScalar(&zero_value, aclDataType::ACL_FLOAT);
2343+
aclScalar* one = aclCreateScalar(&one_value, aclDataType::ACL_FLOAT);
2344+
aclScalar* denom_safe = aclCreateScalar(&denom_safe_value, aclDataType::ACL_FLOAT);
2345+
aclScalar* ext_factor_sc = aclCreateScalar(&ext_factor, aclDataType::ACL_FLOAT);
2346+
2347+
GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor, low, one, acl_yarn_ramp_tensor);
2348+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor, denom_safe);
2349+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor, zero, zero);
2350+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor, one);
2351+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor, one, one);
2352+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, ext_factor_sc);
2353+
2354+
// theta_interp = freq_scale * theta_extrap;
2355+
// theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
2356+
// theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
2357+
// theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
2358+
// theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
2359+
//
2360+
// we cache (freq_scale - freq_scale * ramp_mix + ramp_mix); since the rope_yarn_ramp computed here is the inverse, we instead
2361+
// cache freq_scale + (freq_scale - 1) * ramp_mix
2362+
float freq_scale_1 = freq_scale - 1;
2363+
aclScalar* freq_scale_sc = aclCreateScalar(&freq_scale, aclDataType::ACL_FLOAT);
2364+
aclScalar* freq_scale_1_sc = aclCreateScalar(&freq_scale_1, aclDataType::ACL_FLOAT);
2365+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, freq_scale_1_sc);
2366+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor, freq_scale_sc, one);
2367+
2368+
ggml_cann_release_resources(ctx, low, zero, one, denom_safe, ext_factor_sc, freq_scale_sc, freq_scale_1_sc);
2369+
}
2370+
23212371
// power
23222372
aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
23232373
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
23242374
acl_theta_scale_tensor);
23252375

2326-
// freq_scale
2327-
if (freq_scale != 1) {
2376+
if (ext_factor != 0) {
2377+
aclnn_mul(ctx, acl_theta_scale_tensor, acl_yarn_ramp_tensor);
2378+
} else if (freq_scale != 1) {
23282379
aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
23292380
}
2330-
ggml_cann_release_resources(ctx, acl_theta_scale);
2381+
2382+
ggml_cann_release_resources(ctx, acl_yarn_ramp_tensor, acl_theta_scale);
23312383
} else {
23322384
// use cache
23332385
acl_theta_scale_tensor =
@@ -2385,6 +2437,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
23852437
GGML_MAX_DIMS, ACL_FORMAT_ND);
23862438
aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
23872439

2440+
if (ext_factor != 0) {
2441+
attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
2442+
}
2443+
23882444
// attn_factor
23892445
if (attn_factor != 1) {
23902446
aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
@@ -2465,8 +2521,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
24652521
// TODO: n_dims <= ne0
24662522
GGML_ASSERT(n_dims == ne0);
24672523
GGML_ASSERT(n_dims % 2 == 0);
2468-
// TODO: ext_factor != 0
2469-
GGML_ASSERT(ext_factor == 0);
24702524

24712525
const float theta_scale = powf(freq_base, -2.0f / n_dims);
24722526

@@ -2484,7 +2538,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
24842538
void *cos_tensor_buffer = cos_tensor_allocator.get();
24852539

24862540
// init ctx.rope_cos/rope_sin cache
2487-
aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer,
2541+
aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer, corr_dims, ext_factor,
24882542
theta_scale, freq_scale, attn_factor, is_neox);
24892543

24902544
int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
@@ -3220,7 +3274,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
32203274
const int64_t n_heads = src0->ne[2];
32213275
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
32223276
void* slope_buffer = slope_allocator.get();
3223-
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
3277+
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
32243278

32253279
int64_t slope_ne[] = {1, 1, n_heads, 1};
32263280
size_t slope_nb[GGML_MAX_DIMS];

ggml/src/ggml-cann/common.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,7 @@ struct ggml_backend_cann_context {
395395
#ifdef USE_ACL_GRAPH
396396
/// Cached CANN ACL graph used for executing the current ggml computation graph.
397397
std::unique_ptr<ggml_cann_graph> cann_graph;
398+
bool acl_graph_mode = true;
398399
#endif
399400
cann_task_queue task_queue;
400401
bool async_mode;
@@ -404,7 +405,6 @@ struct ggml_backend_cann_context {
404405
ggml_cann_tensor_cache rms_norm_one_tensor_cache;
405406
ggml_cann_tensor_cache rms_norm_zero_tensor_cache;
406407

407-
408408
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
409409

410410
/**
@@ -419,6 +419,13 @@ struct ggml_backend_cann_context {
419419
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
420420
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
421421
device, async_mode ? "ON" : "OFF");
422+
#ifdef USE_ACL_GRAPH
423+
acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or("")));
424+
GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
425+
__func__, device,
426+
acl_graph_mode ? "GRAPH" : "EAGER",
427+
acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
428+
#endif
422429
}
423430

424431
/**

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2252,6 +2252,10 @@ static enum ggml_status ggml_backend_cann_graph_compute(
22522252
bool use_cann_graph = true;
22532253
bool cann_graph_update_required = false;
22542254

2255+
if (!cann_ctx->acl_graph_mode) {
2256+
use_cann_graph = false;
2257+
}
2258+
22552259
if (use_cann_graph) {
22562260
if (cann_ctx->cann_graph == nullptr) {
22572261
cann_ctx->cann_graph.reset(new ggml_cann_graph());
@@ -2401,16 +2405,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
24012405
}
24022406
case GGML_OP_ROPE: {
24032407
// TODO: with ops-test v == 1
2404-
float ext_factor = 0.0f;
2405-
memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
24062408
// TODO: n_dims <= ne0
24072409
if (op->src[0]->ne[0] != op->op_params[1]) {
24082410
return false;
24092411
}
2410-
// TODO: ext_factor != 0
2411-
if (ext_factor != 0) {
2412-
return false;
2413-
}
24142412

24152413
const int mode = ((const int32_t *) op->op_params)[2];
24162414
if (mode & GGML_ROPE_TYPE_MROPE) {
@@ -2420,9 +2418,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
24202418
return false;
24212419
}
24222420

2423-
if(!ggml_is_contiguous(op->src[0])){
2424-
return false;
2425-
}
24262421
return true;
24272422
}
24282423
case GGML_OP_UPSCALE: {
@@ -2523,13 +2518,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
25232518
// different head sizes of K and V are not supported yet
25242519
return false;
25252520
}
2526-
if (op->src[0]->ne[0] == 192) {
2527-
return false;
2528-
}
2529-
if (op->src[0]->ne[0] == 576) {
2530-
// DeepSeek MLA
2531-
return false;
2532-
}
25332521
if (op->src[0]->ne[0] % 16 != 0) {
25342522
// TODO: padding to support
25352523
return false;

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2776,10 +2776,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
27762776
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
27772777
case GGML_OP_FLASH_ATTN_EXT:
27782778
{
2779-
if (op->src[4]) {
2780-
return false;
2781-
}
2782-
27832779
const ggml_tensor * q = op->src[0];
27842780
const ggml_tensor * k = op->src[1];
27852781
const ggml_tensor * v = op->src[2];
@@ -5765,13 +5761,17 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
57655761
static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) {
57665762
const ggml_tensor * v = dst->src[2];
57675763
const ggml_tensor * mask = dst->src[3];
5764+
const ggml_tensor * sinks = dst->src[4];
57685765
GGML_ASSERT(q->extra);
57695766
GGML_ASSERT(k->extra);
57705767
GGML_ASSERT(v->extra);
57715768
GGML_ASSERT(dst->extra);
57725769
if (mask) {
57735770
GGML_ASSERT(mask->extra);
57745771
}
5772+
if (sinks) {
5773+
GGML_ASSERT(sinks->extra);
5774+
}
57755775

57765776
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
57775777

@@ -5813,13 +5813,16 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
58135813
ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *)v->extra;
58145814
ggml_tensor_extra_cl * extra_o = (ggml_tensor_extra_cl *)dst->extra;
58155815
ggml_tensor_extra_cl * extra_mask = mask ? (ggml_tensor_extra_cl *)mask->extra : NULL;
5816+
ggml_tensor_extra_cl * extra_sinks = sinks ? (ggml_tensor_extra_cl *)sinks->extra : NULL;
58165817

58175818
cl_ulong offset_q = extra_q->offset + q->view_offs;
58185819
cl_ulong offset_k = extra_k->offset + k->view_offs;
58195820
cl_ulong offset_v = extra_v->offset + v->view_offs;
58205821
cl_ulong offset_o = extra_o->offset + dst->view_offs;
58215822
cl_mem mask_buffer = extra_mask ? extra_mask->data_device : NULL;
58225823
cl_ulong offset_mask = extra_mask ? extra_mask->offset + mask->view_offs : 0;
5824+
cl_mem sinks_buffer = extra_sinks ? extra_sinks->data_device : NULL;
5825+
cl_ulong offset_sinks = extra_sinks ? extra_sinks->offset + sinks->view_offs : 0;
58235826

58245827
const cl_ulong q_nb1 = q->nb[1], q_nb2 = q->nb[2], q_nb3 = q->nb[3];
58255828
const cl_ulong k_nb1 = k->nb[1], k_nb2 = k->nb[2], k_nb3 = k->nb[3];
@@ -5874,6 +5877,8 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
58745877
CL_CHECK(clSetKernelArg(kernel, 35, sizeof(cl_ulong), &mask_nb3));
58755878
CL_CHECK(clSetKernelArg(kernel, 36, sizeof(int), &mask_ne2));
58765879
CL_CHECK(clSetKernelArg(kernel, 37, sizeof(int), &mask_ne3));
5880+
CL_CHECK(clSetKernelArg(kernel, 38, sizeof(cl_mem), &sinks_buffer));
5881+
CL_CHECK(clSetKernelArg(kernel, 39, sizeof(cl_ulong), &offset_sinks));
58775882

58785883
if (n_q == 1) {
58795884
const size_t wg_size = 64;

0 commit comments

Comments
 (0)