Skip to content

Commit 1a3bfec

Browse files
committed
Merge branch 'master' into flash-attn-cann
2 parents 89f884e + 9ecf3e6 commit 1a3bfec

File tree

16 files changed

+472
-186
lines changed

16 files changed

+472
-186
lines changed

ggml/include/ggml.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -528,15 +528,15 @@ extern "C" {
528528
GGML_UNARY_OP_STEP,
529529
GGML_UNARY_OP_TANH,
530530
GGML_UNARY_OP_ELU,
531+
GGML_UNARY_OP_RELU,
531532
GGML_UNARY_OP_SIGMOID,
532533
GGML_UNARY_OP_GELU,
533-
GGML_UNARY_OP_GELU_ERF,
534534
GGML_UNARY_OP_GELU_QUICK,
535535
GGML_UNARY_OP_SILU,
536536
GGML_UNARY_OP_HARDSWISH,
537537
GGML_UNARY_OP_HARDSIGMOID,
538538
GGML_UNARY_OP_EXP,
539-
GGML_UNARY_OP_RELU,
539+
GGML_UNARY_OP_GELU_ERF,
540540

541541
GGML_UNARY_OP_COUNT,
542542
};

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 133 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2700,14 +2700,10 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
27002700
}
27012701
}
27022702

2703-
// GroupedMatmulV2 required tensor_list.size < 128
27042703
size_t GROUP_SIZE = 128;
2705-
std::vector<std::vector<aclTensor*>> src0_tensor_vec_vec;
2706-
std::vector<std::vector<aclTensor*>> src1_tensor_vec_vec;
2707-
std::vector<std::vector<aclTensor*>> dst_tensor_vec_vec;
2708-
2709-
// split and call GroupedMatmulV2
2704+
// GroupedMatmulV2 requires tensor_list.size < 128
27102705
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
2706+
// split and call GroupedMatmulV2
27112707
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
27122708
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
27132709
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@@ -2725,13 +2721,144 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
27252721
return;
27262722
}
27272723

2724+
/**
2725+
* @brief Performs expert-specific matrix multiplication (MoE) with
2726+
* quantized precision using the CANN backend.
2727+
*
2728+
* This function executes a matrix multiplication operation tailored for
2729+
* Mixture of Experts (MoE) models, where the input tensor is multiplied
2730+
* with expert-specific quantized weight matrices. It leverages the CANN
2731+
* backend to perform efficient low-precision computations and stores the
2732+
* quantized result in the destination tensor `dst`.
2733+
*
2734+
* Quantization techniques reduce memory footprint and improve performance
2735+
* by using lower-bit representations (e.g., int8) instead of floating-point.
2736+
* This function is designed to work with such formats and may incorporate
2737+
* optimizations like identity-based fast paths or routing masks for sparse
2738+
* expert selection.
2739+
*
2740+
* @param ctx The context for executing CANN backend operations.
2741+
* @param dst The destination tensor where the quantized MoE multiplication result
2742+
* will be stored.
2743+
*
2744+
* @note This function assumes quantized data types and is designed for
2745+
* MoE architectures with potential sparse expert routing.
2746+
*/
2747+
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2748+
// TODO: Use aclnnGroupedMatMul
2749+
//dst [M, K, N, 1]
2750+
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
2751+
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
2752+
ggml_tensor * ids = dst->src[2]; //ids [K, N]
2753+
2754+
GGML_TENSOR_BINARY_OP_LOCALS
2755+
2756+
// copy index from npu to cpu
2757+
int64_t n_as = ne02; // A
2758+
int64_t n_ids = ids->ne[0]; // K
2759+
2760+
std::vector<char> ids_host(ggml_nbytes(ids));
2761+
ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
2762+
ACL_MEMCPY_DEVICE_TO_HOST);
2763+
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
2764+
2765+
char * src0_original = (char *) src0->data;
2766+
char * src1_original = (char *) src1->data;
2767+
char * dst_original = (char *) dst->data;
2768+
2769+
ggml_tensor src0_row = *src0;
2770+
ggml_tensor src1_row = *src1;
2771+
ggml_tensor dst_row = *dst;
2772+
2773+
const enum ggml_type type = dst->src[0]->type;
2774+
float weight_elem_size;
2775+
if (type == GGML_TYPE_Q4_0) {
2776+
weight_elem_size = float(sizeof(uint8_t)) / 2;
2777+
} else if (type == GGML_TYPE_Q8_0) {
2778+
weight_elem_size = float(sizeof(uint8_t));
2779+
} else {
2780+
GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 ");
2781+
}
2782+
2783+
// src0_row [D, M, 1, 1] weight without permute
2784+
src0_row.ne[2] = 1;
2785+
src0_row.ne[3] = 1;
2786+
src0_row.nb[0] = weight_elem_size;
2787+
src0_row.nb[1] = weight_elem_size * ne00;
2788+
src0_row.nb[2] = weight_elem_size * ne00;
2789+
src0_row.nb[3] = weight_elem_size * ne00;
2790+
size_t weight_stride = ne00 * ne01 * weight_elem_size;
2791+
size_t weight_size = weight_stride * ne02 * ne03;
2792+
2793+
// scale [D, M, 1, 1] -> scale && permute
2794+
size_t scale_elem_size = sizeof(uint16_t);
2795+
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
2796+
2797+
// src1_row [D, 1, 1, 1] -> input
2798+
src1_row.ne[1] = 1;
2799+
src1_row.ne[2] = 1;
2800+
src1_row.ne[3] = 1;
2801+
src1_row.nb[2] = nb11;
2802+
src1_row.nb[3] = nb11;
2803+
2804+
// dst_row [M, 1, 1, 1] -> out
2805+
dst_row.ne[1] = 1;
2806+
dst_row.ne[2] = 1;
2807+
dst_row.ne[3] = 1;
2808+
dst_row.nb[2] = nb1;
2809+
dst_row.nb[3] = nb1;
2810+
2811+
//create weight for one row
2812+
ggml_cann_pool_alloc weight_allocator(ctx.pool());
2813+
void* weight_buffer = weight_allocator.alloc(nb02);
2814+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
2815+
for (int64_t id = 0; id < n_ids; id++) {
2816+
// expert index
2817+
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
2818+
GGML_ASSERT(i02 >= 0 && i02 < n_as);
2819+
2820+
// If B = 1 (broadcast), always use 0; otherwise, use id.
2821+
int64_t i11 = (ne11 == 1 ? 0 : id);
2822+
int64_t i12 = iid1;
2823+
2824+
int64_t i1 = id;
2825+
int64_t i2 = i12;
2826+
2827+
void* src0_tmp_ptr = src0_original + i02*weight_stride;
2828+
void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
2829+
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
2830+
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
2831+
2832+
// mem cpy
2833+
ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
2834+
ACL_MEMCPY_DEVICE_TO_DEVICE);
2835+
void* scale_buffer = (char*)weight_buffer + weight_stride;
2836+
ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
2837+
ACL_MEMCPY_DEVICE_TO_DEVICE);
2838+
2839+
src0_row.data = weight_buffer;
2840+
src1_row.data = src1_tmp_ptr;
2841+
dst_row.data = dst_tmp_ptr;
2842+
dst_row.src[0] = &src0_row;
2843+
dst_row.src[1] = &src1_row;
2844+
2845+
ggml_cann_mul_mat(ctx, &dst_row);
2846+
}
2847+
}
2848+
return;
2849+
}
2850+
27282851
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
27292852
const enum ggml_type type = dst->src[0]->type;
27302853
switch (type) {
27312854
case GGML_TYPE_F32:
27322855
case GGML_TYPE_F16:
27332856
ggml_cann_mul_mat_id_fp(ctx, dst);
27342857
break;
2858+
case GGML_TYPE_Q4_0:
2859+
case GGML_TYPE_Q8_0:
2860+
ggml_cann_mul_mat_id_quant(ctx, dst);
2861+
break;
27352862
default:
27362863
GGML_ABORT("Unsupported type for mul_mat_id");
27372864
break;

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2039,6 +2039,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
20392039
case GGML_TYPE_F16:
20402040
case GGML_TYPE_F32:
20412041
return true;
2042+
case GGML_TYPE_Q8_0:
2043+
case GGML_TYPE_Q4_0:
2044+
#ifdef ASCEND_310P
2045+
// Q4 && Q8 per group is not supported on the 310p device
2046+
return false;
2047+
#endif
2048+
// only support contiguous for quantized types.
2049+
return ggml_is_contiguous(op->src[0]) &&
2050+
ggml_is_contiguous(op->src[1]);
20422051
default:
20432052
return false;
20442053
}

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2804,23 +2804,29 @@ static vk_device ggml_vk_get_device(size_t idx) {
28042804
pipeline_robustness = true;
28052805
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
28062806
device->subgroup_size_control = true;
2807+
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
28072808
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
28082809
!getenv("GGML_VK_DISABLE_COOPMAT")) {
28092810
device->coopmat_support = true;
28102811
device->coopmat_m = 0;
28112812
device->coopmat_n = 0;
28122813
device->coopmat_k = 0;
2814+
#endif
2815+
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
28132816
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
28142817
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
28152818
coopmat2_support = true;
2819+
#endif
28162820
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
28172821
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
28182822
!getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
28192823
device->integer_dot_product = true;
28202824
#endif
2825+
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
28212826
} else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 &&
28222827
!getenv("GGML_VK_DISABLE_BFLOAT16")) {
28232828
bfloat16_support = true;
2829+
#endif
28242830
}
28252831
}
28262832

@@ -4670,6 +4676,19 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
46704676
}
46714677
}
46724678

4679+
if (src->type == to) {
4680+
// Copy two or four bytes at a time, depending on block size.
4681+
// For quantized types, we scale by block size/type size. But
4682+
// this path is also used for bf16->bf16 for example, where the
4683+
// type size must be exactly 2 or 4.
4684+
GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4);
4685+
if ((ggml_type_size(src->type) % 4) == 0) {
4686+
return ctx->device->pipeline_contig_cpy_f32_f32;
4687+
} else {
4688+
return ctx->device->pipeline_contig_cpy_f16_f16;
4689+
}
4690+
}
4691+
46734692
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
46744693
GGML_ABORT("fatal error");
46754694
}
@@ -6731,7 +6750,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
67316750
case GGML_OP_UNARY:
67326751
case GGML_OP_CONV_2D_DW:
67336752
{
6734-
const uint32_t ne = ggml_nelements(dst);
6753+
uint32_t ne = ggml_nelements(dst);
6754+
if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
6755+
// Convert from number of logical elements to 2- or 4-byte units.
6756+
ne /= ggml_blck_size(src0->type);
6757+
if ((ggml_type_size(src0->type) % 4) == 0) {
6758+
ne *= ggml_type_size(src0->type) / 4;
6759+
} else {
6760+
ne *= ggml_type_size(src0->type) / 2;
6761+
}
6762+
}
67356763
if (ne > 262144) {
67366764
elements = { 512, 512, CEIL_DIV(ne, 262144) };
67376765
} else if (ne > 512) {
@@ -7281,8 +7309,19 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
72817309
const uint32_t src0_type_size = ggml_type_size(src0->type);
72827310
const uint32_t dst_type_size = ggml_type_size(dst->type);
72837311

7312+
uint32_t ne = (uint32_t)ggml_nelements(src0);
7313+
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
7314+
// Convert from number of logical elements to 2- or 4-byte units.
7315+
ne /= ggml_blck_size(src0->type);
7316+
if ((ggml_type_size(src0->type) % 4) == 0) {
7317+
ne *= ggml_type_size(src0->type) / 4;
7318+
} else {
7319+
ne *= ggml_type_size(src0->type) / 2;
7320+
}
7321+
}
7322+
72847323
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
7285-
(uint32_t)ggml_nelements(src0),
7324+
ne,
72867325
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
72877326
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
72887327
0,
@@ -9264,8 +9303,7 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
92649303
try {
92659304
ptr = ggml_vk_host_malloc(vk_instance.devices[0], size);
92669305
} catch (vk::SystemError& e) {
9267-
std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
9268-
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
9306+
GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what());
92699307
// fallback to cpu buffer
92709308
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
92719309
}
@@ -9867,6 +9905,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
98679905
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
98689906
return true;
98699907
}
9908+
9909+
// We can handle copying from a type to the same type if it's
9910+
// contiguous (memcpy). We use f16 or f32 shaders to do the copy,
9911+
// so the type/block size must be a multiple of 2.
9912+
if (src0_type == src1_type &&
9913+
ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op) &&
9914+
(ggml_type_size(src0_type) % 2) == 0) {
9915+
return true;
9916+
}
98709917
return false;
98719918
} break;
98729919
case GGML_OP_REPEAT:

tools/mtmd/mtmd-helper.cpp

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,7 @@ size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
1212
size_t n_tokens = 0;
1313
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
1414
auto chunk = mtmd_input_chunks_get(chunks, i);
15-
auto chunk_type = mtmd_input_chunk_get_type(chunk);
16-
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
17-
size_t n_tokens_text;
18-
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
19-
n_tokens += n_tokens_text;
20-
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
21-
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
22-
n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
23-
} else {
24-
GGML_ASSERT(false && "chunk type not supported");
25-
}
15+
n_tokens += mtmd_input_chunk_get_n_tokens(chunk);
2616
}
2717
return n_tokens;
2818
}
@@ -31,17 +21,7 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
3121
llama_pos n_pos = 0;
3222
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
3323
auto chunk = mtmd_input_chunks_get(chunks, i);
34-
auto chunk_type = mtmd_input_chunk_get_type(chunk);
35-
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
36-
size_t n_tokens_text;
37-
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
38-
n_pos += n_tokens_text;
39-
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
40-
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
41-
n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
42-
} else {
43-
GGML_ASSERT(false && "chunk type not supported");
44-
}
24+
n_pos += mtmd_input_chunk_get_n_pos(chunk);
4525
}
4626
return n_pos;
4727
}

tools/mtmd/mtmd.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,10 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
751751
return bitmap->data.data();
752752
}
753753

754+
size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
755+
return bitmap->data.size();
756+
}
757+
754758
bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
755759
return bitmap->is_audio;
756760
}

tools/mtmd/mtmd.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,12 @@ MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
119119
// the data is in float format (PCM F32)
120120
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
121121
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
122-
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
123-
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
124-
MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap);
125-
MTMD_API bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap);
126-
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
122+
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
123+
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
124+
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
125+
MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
126+
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
127+
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
127128
// bitmap ID is optional, but useful for KV cache tracking
128129
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
129130
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
@@ -322,6 +323,7 @@ struct bitmap {
322323
uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
323324
uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
324325
const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
326+
size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
325327
std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
326328
void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
327329
};

tools/server/public/index.html.gz

528 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)