Skip to content

Commit 3ecd8ce

Browse files
Merge pull request #272 from menloresearch/update-dev-from-master-2025-10-04-00-30
Sync master with upstream release b6686
2 parents 02010ec + 128d522 commit 3ecd8ce

File tree

23 files changed

+455
-243
lines changed

23 files changed

+455
-243
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ jobs:
9797
ctest -L 'main|curl' --verbose --timeout 900
9898
9999
macOS-latest-cmake-x64:
100-
runs-on: macos-13
100+
runs-on: macos-15-intel
101101

102102
steps:
103103
- name: Clone

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ jobs:
7575
name: llama-bin-macos-arm64.zip
7676

7777
macOS-x64:
78-
runs-on: macos-13
78+
runs-on: macos-15-intel
7979

8080
steps:
8181
- name: Clone

common/arg.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1932,13 +1932,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
19321932
}
19331933
).set_env("LLAMA_ARG_SWA_FULL"));
19341934
add_opt(common_arg(
1935-
{"--swa-checkpoints"}, "N",
1936-
string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
1937-
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
1935+
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
1936+
string_format("max number of context checkpoints to create per slot (default: %d)\n"
1937+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
19381938
[](common_params & params, int value) {
1939-
params.n_swa_checkpoints = value;
1939+
params.n_ctx_checkpoints = value;
19401940
}
1941-
).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
1941+
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
19421942
add_opt(common_arg(
19431943
{"--kv-unified", "-kvu"},
19441944
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"

common/chat.cpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,7 @@ const char * common_chat_format_name(common_chat_format format) {
625625
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
626626
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
627627
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
628+
case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
628629
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
629630
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
630631
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
@@ -984,6 +985,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
984985
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
985986
return data;
986987
}
988+
989+
// Builds the chat parameters for a template handled as the Magistral format.
//
// - Renders the prompt with apply(tmpl, inputs) and tags the result as
//   COMMON_CHAT_FORMAT_MAGISTRAL.
// - Always preserves the "[THINK]" and "[/THINK]" tokens.
// - If inputs.tools is a non-empty array, builds a grammar whose root is the
//   literal "[TOOL_CALLS]" followed by a JSON array of tool-call objects, and
//   registers "[TOOL_CALLS]" as both a grammar trigger and a preserved token.
// - Otherwise the grammar is taken from inputs.json_schema (converted) or,
//   failing that, from inputs.grammar.
//
// Throws std::runtime_error on the no-tools path when both json_schema and
// grammar are provided.
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
990+
common_chat_params data;
991+
data.prompt = apply(tmpl, inputs);
992+
data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
993+
data.preserved_tokens = {
994+
"[THINK]",
995+
"[/THINK]",
996+
};
997+
998+
if (inputs.tools.is_array() && !inputs.tools.empty()) {
999+
// grammar_lazy is set unless the caller requires a tool call
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
1000+
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
1001+
// one JSON schema per declared function: {name, arguments, id}, all required
auto schemas = json::array();
1002+
foreach_function(inputs.tools, [&](const json & tool) {
1003+
const auto & function = tool.at("function");
1004+
schemas.push_back({
1005+
{"type", "object"},
1006+
{"properties", {
1007+
{"name", {
1008+
{"type", "string"},
1009+
// the name must equal this function's declared name
{"const", function.at("name")},
1010+
}},
1011+
{"arguments", function.at("parameters")},
1012+
{"id", {
1013+
{"type", "string"},
1014+
// call id: exactly 9 ASCII alphanumeric characters
{"pattern", "^[a-zA-Z0-9]{9}$"},
1015+
}},
1016+
}},
1017+
{"required", json::array({"name", "arguments", "id"})},
1018+
});
1019+
});
1020+
// the tool-call payload is a non-empty array of the per-function schemas
auto schema = json {
1021+
{"type", "array"},
1022+
// a single tool uses its schema directly; several tools are combined with anyOf
{"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
1023+
{"minItems", 1},
1024+
};
1025+
// cap the array at one entry when parallel tool calls are disabled
if (!inputs.parallel_tool_calls) {
1026+
schema["maxItems"] = 1;
1027+
}
1028+
builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
1029+
});
1030+
data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
1031+
data.preserved_tokens.push_back("[TOOL_CALLS]");
1032+
} else {
1033+
// no tools: grammar_lazy is cleared and any caller-supplied constraint is used as-is
data.grammar_lazy = false;
1034+
if (!inputs.json_schema.is_null()) {
1035+
// json_schema and grammar are mutually exclusive
if (!inputs.grammar.empty()) {
1036+
throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
1037+
}
1038+
data.grammar = json_schema_to_grammar(inputs.json_schema);
1039+
} else {
1040+
data.grammar = inputs.grammar;
1041+
}
1042+
}
1043+
1044+
return data;
1045+
}
1046+
9871047
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
9881048
if (!builder.syntax().parse_tool_calls) {
9891049
builder.add_content(builder.consume_rest());
@@ -994,6 +1054,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
9941054
parse_prefixed_json_tool_call_array(builder, prefix);
9951055
}
9961056

1057+
// Parses model output in the Magistral format.
//
// First calls try_parse_reasoning with the "[THINK]" / "[/THINK]" delimiters.
// If tool-call parsing is disabled in the builder's syntax, the rest of the
// input is added as plain content. Otherwise parsing is delegated to
// parse_prefixed_json_tool_call_array with "[TOOL_CALLS]" as the prefix.
static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
1058+
builder.try_parse_reasoning("[THINK]", "[/THINK]");
1059+
1060+
if (!builder.syntax().parse_tool_calls) {
1061+
builder.add_content(builder.consume_rest());
1062+
return;
1063+
}
1064+
1065+
// function-local static: the regex object is constructed once and reused across calls
// NOTE(review): regex_escape presumably escapes the brackets so the prefix matches literally - confirm
static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
1066+
parse_prefixed_json_tool_call_array(builder, prefix);
1067+
}
1068+
9971069
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
9981070
common_chat_params data;
9991071

@@ -2702,6 +2774,10 @@ static common_chat_params common_chat_templates_apply_jinja(
27022774
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
27032775
}
27042776

2777+
if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
2778+
return common_chat_params_init_magistral(tmpl, params);
2779+
}
2780+
27052781
// Plain handler (no tools)
27062782
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
27072783
return common_chat_params_init_without_tools(tmpl, params);
@@ -2802,6 +2878,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
28022878
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
28032879
common_chat_parse_mistral_nemo(builder);
28042880
break;
2881+
case COMMON_CHAT_FORMAT_MAGISTRAL:
2882+
common_chat_parse_magistral(builder);
2883+
break;
28052884
case COMMON_CHAT_FORMAT_LLAMA_3_X:
28062885
common_chat_parse_llama_3_1(builder);
28072886
break;

common/chat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ enum common_chat_format {
101101
COMMON_CHAT_FORMAT_CONTENT_ONLY,
102102
COMMON_CHAT_FORMAT_GENERIC,
103103
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
104+
COMMON_CHAT_FORMAT_MAGISTRAL,
104105
COMMON_CHAT_FORMAT_LLAMA_3_X,
105106
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
106107
COMMON_CHAT_FORMAT_DEEPSEEK_R1,

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ struct common_params {
424424
int32_t timeout_write = timeout_read; // http write timeout in seconds
425425
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
426426
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
427-
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
427+
int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
428428

429429
std::string hostname = "127.0.0.1";
430430
std::string public_path = ""; // NOLINT

ggml/src/ggml-alloc.c

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -392,12 +392,8 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
392392
free(alloc);
393393
}
394394

395-
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
396-
size_t max_size = 0;
397-
for (int i = 0; i < alloc->n_chunks; i++) {
398-
max_size += alloc->chunks[i]->max_size;
399-
}
400-
return max_size;
395+
// Returns the max_size recorded for chunk index `chunk` of the dynamic
// allocator, or 0 when `chunk` is not below alloc->n_chunks.
// NOTE(review): a negative `chunk` is not rejected here - callers are assumed to pass chunk >= 0; confirm.
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
396+
return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
401397
}
402398

403399

@@ -417,10 +413,8 @@ static void ggml_vbuffer_free(struct vbuffer * buf) {
417413
free(buf);
418414
}
419415

420-
static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
421-
int n = 0;
422-
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
423-
return n;
416+
// Returns the size (via ggml_backend_buffer_get_size) of chunk `chunk` of the
// virtual buffer, or 0 when that chunk slot is NULL.
// `chunk` is used to index buf->chunks without a bounds check, so the caller
// must pass an index that is valid for that array.
static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
417+
return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
424418
}
425419

426420
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
@@ -885,12 +879,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
885879
}
886880
}
887881

888-
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
889-
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
890-
891882
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
892-
if (new_size > cur_size || galloc->buffers[i] == NULL) {
883+
bool realloc = galloc->buffers[i] == NULL;
884+
size_t new_size = 0;
885+
for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
886+
size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
887+
size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
888+
new_size += new_chunk_size;
889+
if (new_chunk_size > cur_chunk_size) {
890+
realloc = true;
891+
}
892+
}
893+
if (realloc) {
893894
#ifndef NDEBUG
895+
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
894896
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
895897
#endif
896898

ggml/src/ggml-metal/ggml-metal-common.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ static bool ggml_mem_ranges_add_dst(ggml_mem_ranges_t mrs, const ggml_tensor * t
112112
}
113113

114114
bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
115-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
115+
for (int i = 0; i < GGML_MAX_SRC; i++) {
116116
if (tensor->src[i]) {
117117
ggml_mem_ranges_add_src(mrs, tensor->src[i]);
118118
}
@@ -173,7 +173,7 @@ static bool ggml_mem_ranges_check_dst(ggml_mem_ranges_t mrs, const ggml_tensor *
173173
}
174174

175175
bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
176-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
176+
for (int i = 0; i < GGML_MAX_SRC; i++) {
177177
if (tensor->src[i]) {
178178
if (!ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
179179
return false;

0 commit comments

Comments
 (0)