From b43d89e311c5e7fbf62e5ec3c0401eb536677267 Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Wed, 16 Apr 2025 16:21:05 +0800 Subject: [PATCH 1/3] CANN: Add 310P operator support check (#12962) --- ggml/src/ggml-cann/aclnn_ops.cpp | 8 ++++++++ ggml/src/ggml-cann/ggml-cann.cpp | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 2c5cdcae32cc8..2c6737ea8cf3f 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -625,6 +625,10 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, bool count_include_pad = true; int64_t divisor_override = 0; int8_t cube_math_type = 0; +#ifdef ASCEND_310P + cube_math_type = 1; +#endif + GGML_CANN_CALL_ACLNN_OP(AvgPool2d, acl_src, kernel_size, strides, paddings_avg, ceil_mode, count_include_pad, divisor_override, cube_math_type, acl_dst); @@ -2590,6 +2594,10 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds int64_t groups = 1; int8_t cubeMathType = 0; +#ifdef ASCEND_310P + cubeMathType = 1; +#endif + GGML_CANN_CALL_ACLNN_OP(Convolution, acl_input, acl_weight, nullptr, stride, padding, dilation, transposed, padding, groups, acl_dst, cubeMathType); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 08b9ca301c617..ca41e02607091 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2022,6 +2022,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, return true; case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: +#ifdef ASCEND_310P + // Q4 && Q8 per group is not supported on 310p device + return false; +#endif // only support contiguous for quantized types. 
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); @@ -2107,6 +2111,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } case GGML_OP_POOL_2D: { const int32_t * opts = (const int32_t *) op->op_params; +#ifdef ASCEND_310P + enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]); + if(opt == GGML_OP_POOL_MAX){ + return false; + } +#endif const int k0 = opts[1]; const int k1 = opts[2]; const int p0 = opts[5]; From 015022bb53387baa8b23817ac03743705c7d472b Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 16 Apr 2025 13:37:25 -0500 Subject: [PATCH 2/3] vulkan: enable coopmat2 FA gqa and split_k optimizations more often (#12931) The grouped query attention optimization doesn't require a power of two ratio, the only thing relying on it was the modulo operation written as bitwise &. split_k need not depend on gqa_ratio - enable it any time there's only one workgroup in the X dimension. The shader gets the split index from the x coord, and multiple workgroups in the X dimension (pre-split) indicates a larger FA operation that wouldn't need splitting. 
--- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 +++--- ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp | 2 +- tests/test-backend-ops.cpp | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 783a0ff86c1c1..0e9b2e8135a7a 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5531,7 +5531,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx uint32_t workgroups_y = (uint32_t)neq2; uint32_t workgroups_z = (uint32_t)neq3; - if (N == 1 && qk_ratio > 1 && is_pow2(qk_ratio) && gqa_ratio <= flash_attention_num_small_rows && + if (N == 1 && qk_ratio > 1 && gqa_ratio <= flash_attention_num_small_rows && qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) { // grouped query attention - make the N dimension equal to gqa_ratio, reduce // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1 @@ -5544,8 +5544,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx uint32_t split_kv = KV; uint32_t split_k = 1; - if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) { - GGML_ASSERT(workgroups_x == 1); + // Try to use split_k when KV is large enough to be worth the overhead + if (workgroups_x == 1 && ctx->device->shader_core_count > 0 && KV >= 512) { // Try to run two workgroups per SM. split_k = ctx->device->shader_core_count * 2 / workgroups_y; if (split_k > 1) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index e1baa85f9e330..b926a578aded6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -131,7 +131,7 @@ ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in A // Load the slope matrix, indexed by Q's dimension 2. 
ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) { - const uint32_t h = iq2 + (r & (p.gqa_ratio - 1)); + const uint32_t h = iq2 + (r % p.gqa_ratio); const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 3a5741c8d959d..1ee742894695b 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4532,7 +4532,9 @@ static std::vector> make_test_cases_perf() { for (int kv : { 4096, 8192, 16384, }) { for (int hs : { 64, 128, }) { - test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + for (int nr : { 1, 4, }) { + test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, nr, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + } } } From 12b17501e6015ffe568ac54fdf08e6580833bf1b Mon Sep 17 00:00:00 2001 From: kimminsu <80271594+kimminsu38oo@users.noreply.github.com> Date: Thu, 17 Apr 2025 06:25:57 +0900 Subject: [PATCH 3/3] opencl: fix incorrect local_size index in profiling log (#12868) --- ggml/src/ggml-opencl/ggml-opencl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index fa40abc33e624..05a2f4e630a56 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -1521,7 +1521,7 @@ static void ggml_cl2_free(void) { info.cmd_complete_duration_ns/1.e6f, info.cmd_total_duration_ns/1.e6f, info.global_size[0], info.global_size[1], info.global_size[2], - info.local_size[0], info.local_size[2], info.local_size[2], + info.local_size[0], info.local_size[1], info.local_size[2], info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); } fclose(fperf);