Commit ffe5646

Merge pull request #225 from menloresearch/update-dev-from-master-2025-08-30-10-15
Sync master with upstream release b6323
2 parents e613356 + ad5f187 · commit ffe5646

5 files changed: +24 lines, -26 lines

docs/function-calling.md

Lines changed: 2 additions & 0 deletions

@@ -21,6 +21,8 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
 - Use `--chat-template-file` to override the template when appropriate (see examples below)
 - Generic support may consume more tokens and be less efficient than a model's native format.
 
+- Multiple/parallel tool calling is supported on some models but disabled by default, enable it by passing `"parallel_tool_calls": true` in the completion endpoint payload.
+
 <details>
 <summary>Show some common templates and which format handler they use</summary>
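For reference, a minimal sketch of the payload described above, assuming a local `llama-server` listening on `localhost:8080`; the `get_weather` tool and its schema are placeholders, not part of the commit:

```python
# Sketch: opt into multiple/parallel tool calls on the completion endpoint.
# Assumes llama-server runs locally with a model whose chat template supports
# parallel tool calling; the get_weather tool below is purely illustrative.
import requests

payload = {
    "messages": [{"role": "user", "content": "Weather in Paris and in Berlin?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
    "parallel_tool_calls": True,  # disabled by default
}

resp = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"].get("tool_calls"))
```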

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 4 additions & 4 deletions

@@ -1155,7 +1155,7 @@ namespace {
  * @note The workspace buffer used in this function is managed globally and reused
  * across calls. This reduces overhead from repeated memory allocation and deallocation.
  */
-static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
+static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
     aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
                                                           tensor->nb, 2, ACL_FORMAT_ND, offset);
     uint64_t workspaceSize = 0;
@@ -1203,7 +1203,7 @@ static void ggml_backend_cann_buffer_set_tensor(
         if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
             GGML_ASSERT(tensor->ne[2] == 1);
             GGML_ASSERT(tensor->ne[3] == 1);
-            weight_format_to_nz(tensor, data, offset);
+            weight_format_to_nz(tensor, offset);
         }
     } else {
         void *transform_buffer = malloc(size);
@@ -2491,7 +2491,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             return true;
         case GGML_OP_SCALE:
             float bias;
-            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
+            memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
             return bias == 0.0f; // TODO: support bias != 0.0f
         case GGML_OP_SOFT_MAX:
             // TODO: support attention sinks [TAG_ATTN_SINKS]
@@ -2534,7 +2534,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                 return false;
             }
             float logitSoftcap = 0.0f;
-            memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
+            memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float));
             if(logitSoftcap != 0.0f) {
                 return false;
             }

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 15 additions & 20 deletions

@@ -5800,11 +5800,6 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
             ggml_vk_sync_buffers(ctx, subctx);
         }
     }
-    if (y_non_contig || quantize_y) {
-        if (ctx->prealloc_y_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-    }
 
     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -5816,6 +5811,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     if (y_non_contig) {
         if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
@@ -5824,6 +5822,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     if (quantize_y) {
         if (ctx->prealloc_y_last_pipeline_used != to_q8_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
             ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
             ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
@@ -6008,11 +6009,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
             ggml_vk_sync_buffers(ctx, subctx);
         }
     }
-    if (y_non_contig) {
-        if (ctx->prealloc_y_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-    }
 
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
@@ -6022,6 +6018,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
         if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
@@ -6454,11 +6453,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
             ggml_vk_sync_buffers(ctx, subctx);
         }
     }
-    if (y_non_contig) {
-        if (ctx->prealloc_y_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-    }
 
     if (x_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
@@ -6471,6 +6465,9 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     if (y_non_contig) {
         if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
@@ -6668,11 +6665,6 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
             ggml_vk_sync_buffers(ctx, subctx);
         }
     }
-    if (y_non_contig) {
-        if (ctx->prealloc_y_need_sync) {
-            ggml_vk_sync_buffers(ctx, subctx);
-        }
-    }
 
     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
@@ -6682,6 +6674,9 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
         if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() ||
             ctx->prealloc_y_last_tensor_used != src1) {
+            if (ctx->prealloc_y_need_sync) {
+                ggml_vk_sync_buffers(ctx, subctx);
+            }
             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;

scripts/compare-llama-bench.py

Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@
 DEFAULT_SHOW_TEST_BACKEND_OPS = ["backend_name", "op_name"]  # Always show these properties by default.
 DEFAULT_HIDE_TEST_BACKEND_OPS = ["error_message"]  # Always hide these properties by default.
 
-GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "]  # Strip prefixes for smaller tables.
+GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon ", "AMD Instinct "]  # Strip prefixes for smaller tables.
 MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}
 
 DESCRIPTION = """Creates tables from llama-bench or test-backend-ops data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
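The strip list above is used to shorten GPU names in the generated comparison tables; a rough sketch of that kind of prefix stripping (the `strip_gpu_name` helper is illustrative only, not the script's actual code):

```python
# Illustrative sketch of how a prefix list such as GPU_NAME_STRIP can be
# applied to device names; the real logic lives in compare-llama-bench.py.
GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon ", "AMD Instinct "]

def strip_gpu_name(name: str) -> str:
    for prefix in GPU_NAME_STRIP:
        name = name.replace(prefix, "")
    return name

print(strip_gpu_name("AMD Instinct MI300X"))      # -> "MI300X"
print(strip_gpu_name("NVIDIA GeForce RTX 4090"))  # -> "RTX 4090"
```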

tools/server/README.md

Lines changed: 2 additions & 1 deletion

@@ -62,7 +62,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
 | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
 | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
-| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
 | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
 | `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
@@ -1143,6 +1142,8 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
 
 `parse_tool_calls`: Whether to parse the generated tool call.
 
+`parallel_tool_calls` : Whether to enable parallel/multiple tool calls (only supported on some models, verification is based on jinja template).
+
 *Examples:*
 
 You can use either Python `openai` library with appropriate checkpoints:
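As a rough sketch of the documented parameter in use with that client (base URL, API key, model name, and the `get_weather` tool are placeholders; `parallel_tool_calls` is passed through `extra_body` so it is forwarded to the server regardless of the installed `openai` SDK version):

```python
# Sketch: enabling parallel/multiple tool calls via the OpenAI Python client
# pointed at a local llama-server; all names below are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="local-model",  # llama-server serves whatever model it was started with
    messages=[{"role": "user", "content": "What's the weather in Paris and Berlin?"}],
    tools=tools,
    extra_body={"parallel_tool_calls": True},  # disabled by default
)

print(response.choices[0].message.tool_calls)
```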
