Skip to content

Commit a490186

Browse files
committed
Merge branch 'concedo_experimental' into esocrok
2 parents 53cd962 + 1d728bb commit a490186

File tree

21 files changed

+468
-256
lines changed

21 files changed

+468
-256
lines changed

common/arg.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1955,13 +1955,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
19551955
}
19561956
).set_env("LLAMA_ARG_SWA_FULL"));
19571957
add_opt(common_arg(
1958-
{"--swa-checkpoints"}, "N",
1959-
string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
1960-
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
1958+
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
1959+
string_format("max number of context checkpoints to create per slot (default: %d)\n"
1960+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
19611961
[](common_params & params, int value) {
1962-
params.n_swa_checkpoints = value;
1962+
params.n_ctx_checkpoints = value;
19631963
}
1964-
).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
1964+
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
19651965
add_opt(common_arg(
19661966
{"--kv-unified", "-kvu"},
19671967
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"

common/chat.cpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,7 @@ const char * common_chat_format_name(common_chat_format format) {
625625
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
626626
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
627627
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
628+
case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
628629
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
629630
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
630631
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
@@ -984,6 +985,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
984985
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
985986
return data;
986987
}
988+
989+
// Builds the chat parameters for the Magistral format.
//
// The prompt is rendered from `tmpl`, and "[THINK]" / "[/THINK]" are always listed as preserved tokens.
// - With a non-empty tools array: a grammar is generated that requires the literal "[TOOL_CALLS]"
//   followed by a JSON array of {name, arguments, id} objects (at least one item, at most one when
//   parallel tool calls are disabled). The grammar is lazy unless the tool choice is "required",
//   and "[TOOL_CALLS]" is registered both as a word trigger and as a preserved token.
// - Without tools: the grammar is never lazy and comes from `json_schema` when it is set, otherwise
//   from `grammar`. Supplying both throws std::runtime_error.
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt           = apply(tmpl, inputs);
    data.format           = COMMON_CHAT_FORMAT_MAGISTRAL;
    data.preserved_tokens = {
        "[THINK]",
        "[/THINK]",
    };

    const bool has_tools = inputs.tools.is_array() && !inputs.tools.empty();

    // No tools: only an optional user-supplied constraint (JSON schema or raw grammar) applies.
    if (!has_tools) {
        data.grammar_lazy = false;
        if (inputs.json_schema.is_null()) {
            data.grammar = inputs.grammar;
            return data;
        }
        if (!inputs.grammar.empty()) {
            throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
        }
        data.grammar = json_schema_to_grammar(inputs.json_schema);
        return data;
    }

    // Tools present: constrain the output to "[TOOL_CALLS]" followed by an array of tool-call objects.
    data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    data.grammar = build_grammar([&](const common_grammar_builder & builder) {
        auto call_schemas = json::array();
        foreach_function(inputs.tools, [&](const json & tool) {
            const auto & fn = tool.at("function");
            call_schemas.push_back({
                {"type", "object"},
                {"properties", {
                    {"name", {
                        {"type", "string"},
                        {"const", fn.at("name")},
                    }},
                    {"arguments", fn.at("parameters")},
                    {"id", {
                        {"type", "string"},
                        // the call id is constrained to exactly nine alphanumeric characters
                        {"pattern", "^[a-zA-Z0-9]{9}$"},
                    }},
                }},
                {"required", json::array({"name", "arguments", "id"})},
            });
        });

        // A single tool is used directly as the item schema; several tools are combined with anyOf.
        auto calls_array_schema = json {
            {"type", "array"},
            {"items", call_schemas.size() == 1 ? call_schemas[0] : json {{"anyOf", call_schemas}}},
            {"minItems", 1},
        };
        if (!inputs.parallel_tool_calls) {
            calls_array_schema["maxItems"] = 1;
        }

        const auto tool_calls_rule = builder.add_schema("tool_calls", calls_array_schema);
        builder.add_rule("root", "\"[TOOL_CALLS]\" " + tool_calls_rule);
    });
    data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
    data.preserved_tokens.push_back("[TOOL_CALLS]");

    return data;
}
1046+
9871047
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
9881048
if (!builder.syntax().parse_tool_calls) {
9891049
builder.add_content(builder.consume_rest());
@@ -994,6 +1054,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
9941054
parse_prefixed_json_tool_call_array(builder, prefix);
9951055
}
9961056

1057+
// Parses a Magistral-format response: first attempts to extract a "[THINK]" ... "[/THINK]"
// reasoning section, then either parses a "[TOOL_CALLS]"-prefixed JSON tool-call array
// (when tool-call parsing is enabled) or adds the remaining text as plain content.
static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("[THINK]", "[/THINK]");

    if (builder.syntax().parse_tool_calls) {
        // Built once on first use; the literal is escaped so the brackets are matched verbatim.
        static const common_regex tool_calls_prefix(regex_escape("[TOOL_CALLS]"));
        parse_prefixed_json_tool_call_array(builder, tool_calls_prefix);
        return;
    }

    // Tool-call parsing disabled: the rest of the output is content.
    builder.add_content(builder.consume_rest());
}
1068+
9971069
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
9981070
common_chat_params data;
9991071

@@ -2702,6 +2774,10 @@ static common_chat_params common_chat_templates_apply_jinja(
27022774
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
27032775
}
27042776

2777+
if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
2778+
return common_chat_params_init_magistral(tmpl, params);
2779+
}
2780+
27052781
// Plain handler (no tools)
27062782
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
27072783
return common_chat_params_init_without_tools(tmpl, params);
@@ -2802,6 +2878,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
28022878
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
28032879
common_chat_parse_mistral_nemo(builder);
28042880
break;
2881+
case COMMON_CHAT_FORMAT_MAGISTRAL:
2882+
common_chat_parse_magistral(builder);
2883+
break;
28052884
case COMMON_CHAT_FORMAT_LLAMA_3_X:
28062885
common_chat_parse_llama_3_1(builder);
28072886
break;

common/chat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ enum common_chat_format {
101101
COMMON_CHAT_FORMAT_CONTENT_ONLY,
102102
COMMON_CHAT_FORMAT_GENERIC,
103103
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
104+
COMMON_CHAT_FORMAT_MAGISTRAL,
104105
COMMON_CHAT_FORMAT_LLAMA_3_X,
105106
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
106107
COMMON_CHAT_FORMAT_DEEPSEEK_R1,

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ struct common_params {
420420
int32_t timeout_write = timeout_read; // http write timeout in seconds
421421
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
422422
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
423-
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
423+
int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
424424

425425
std::string hostname = "127.0.0.1";
426426
std::string public_path = ""; // NOLINT

embd_res/klite.embd

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
1212
-->
1313

1414
<script id="init-config">
15-
const LITEVER = 285;
15+
const LITEVER = 286;
1616
const urlParams = new URLSearchParams(window.location.search);
1717
var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
1818
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -4282,6 +4282,15 @@ Current version indicated by LITEVER below.
42824282
{
42834283
return str.replace(new RegExp(escapeRegExp(find), (caseInsensitive?'gi':'g')), replace);
42844284
}
4285+
function literalReplace(str, search, replacement) {
    // Replaces only the FIRST occurrence of `search` in `str`, treating both `search` and
    // `replacement` as plain text (no regex matching, no "$&"-style replacement patterns).
    // A null/undefined `str`, or a null/undefined/empty `search`, returns `str` unchanged.
    if (str == null || search == null || search === "") {
        return str;
    }
    const pos = str.indexOf(search);
    if (pos < 0) {
        return str;
    }
    const head = str.slice(0, pos);
    const tail = str.slice(pos + search.length);
    return head + replacement + tail;
}
42854294
function rgb_to_hex(rgbColor) { //convert rgb color to hex
42864295
rgbColor = rgbColor.split("(")[1];
42874296
rgbColor = rgbColor.split(")")[0];
@@ -6706,6 +6715,23 @@ Current version indicated by LITEVER below.
67066715
}
67076716
return outp + "</table>";
67086717
};
6718+
function stashLatex(text) {
    // Swaps every LaTeX span matched by the pattern below for a unique %%LATEXBLKn%% placeholder,
    // so that later markdown processing cannot corrupt it. Returns the rewritten text together
    // with the list of {key, value} pairs needed to restore the original spans.
    const latexPattern = /(^ {0,6}\\\[\n([\s\S]*?)\n {0,6}\\\] {0,2}$|^\$\$\n([\s\S]*?)\n\$\$$|^ {2}\$\$\n {2}([\s\S]*?)\n {2}\$\$$|\$\$([^\n]+?)\$\$|\\\(([^\n]+?)\\\)|\\\[([^\n]+?)\\\])/gm;
    const latexBlocks = [];
    const stashedText = text.replace(latexPattern, (match) => {
        // the placeholder index is simply the number of blocks stashed so far
        const key = `%%LATEXBLK${latexBlocks.length}%%`;
        latexBlocks.push({ key: key, value: match });
        return key;
    });
    return { "text": stashedText, "latexBlocks": latexBlocks };
}
6728+
function unstashLatex(text, latexBlocks) {
    // Puts each stashed LaTeX span back in place of its placeholder key, in stash order.
    // literalReplace is used so the restored LaTeX is inserted verbatim.
    let restored = text;
    for (const block of latexBlocks) {
        restored = literalReplace(restored, block.key, block.value);
    }
    return restored;
}
67096735
const replaceLatex = (input) =>{
67106736
//all latex patterns except inline tex
67116737
input = input.replace(/^<blockquote>(\\\[\n[\s\S]*?\n\\\]) {0,2}<\/blockquote> {0,2}$/gm, (match, p1) => {
@@ -6720,6 +6746,10 @@ Current version indicated by LITEVER below.
67206746
{
67216747
return match;
67226748
}
6749+
if(p6 && p6.startsWith("$")) //buggy match, do not proceed
6750+
{
6751+
return match;
6752+
}
67236753
return leadingWhitespace + temml.renderToString(content); // render LaTeX content
67246754
});
67256755
input = input.replace(/^<blockquote>\n([\s\S]*?)\n<\/blockquote>/gm, (match, p1) => {
@@ -6732,6 +6762,10 @@ Current version indicated by LITEVER below.
67326762
{
67336763
return match;
67346764
}
6765+
if(p1 && p1.startsWith("$")) //buggy match, do not proceed
6766+
{
6767+
return match;
6768+
}
67356769
return prefix+temml.renderToString(content); // render LaTeX content
67366770
});
67376771
input = input.replace(/(^|[^\\])\$([A-Za-z0-9])\$(?!\d)/g, (match, prefix, p1) => { //single letter or number
@@ -6798,6 +6832,13 @@ Current version indicated by LITEVER below.
67986832
append_spcetg = true;
67996833
}
68006834

6835+
let stashres = null;
6836+
if(renderLatex)
6837+
{
6838+
stashres = stashLatex(md);
6839+
md = stashres.text;
6840+
}
6841+
68016842
md = md.replace(/^###### (.*?)\s*#*$/gm, "<h6>$1</h6>")
68026843
.replace(/^##### (.*?)\s*#*$/gm, "<h5>$1</h5>")
68036844
.replace(/^#### (.*?)\s*#*$/gm, "<h4>$1</h4>")
@@ -6865,6 +6906,10 @@ Current version indicated by LITEVER below.
68656906
md = md.replace(/<\/code\><\/pre\>\n<pre\><code\>/g, "\n");
68666907
if(renderLatex)
68676908
{
6909+
if(stashres!=null)
6910+
{
6911+
md = unstashLatex(md,stashres.latexBlocks);
6912+
}
68686913
//to aid the latex renderer, we temporarily add newlines between some blocks
68696914
md = md.replace(/<\/li><li>/gm, "</li>\n<li>");
68706915
md = replaceLatex(md);

ggml/src/ggml-alloc.c

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -392,12 +392,8 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
392392
free(alloc);
393393
}
394394

395-
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
396-
size_t max_size = 0;
397-
for (int i = 0; i < alloc->n_chunks; i++) {
398-
max_size += alloc->chunks[i]->max_size;
399-
}
400-
return max_size;
395+
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
396+
return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
401397
}
402398

403399

@@ -417,10 +413,8 @@ static void ggml_vbuffer_free(struct vbuffer * buf) {
417413
free(buf);
418414
}
419415

420-
static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
421-
int n = 0;
422-
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
423-
return n;
416+
static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
417+
return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
424418
}
425419

426420
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
@@ -885,12 +879,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
885879
}
886880
}
887881

888-
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
889-
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
890-
891882
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
892-
if (new_size > cur_size || galloc->buffers[i] == NULL) {
883+
bool realloc = galloc->buffers[i] == NULL;
884+
size_t new_size = 0;
885+
for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
886+
size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
887+
size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
888+
new_size += new_chunk_size;
889+
if (new_chunk_size > cur_chunk_size) {
890+
realloc = true;
891+
}
892+
}
893+
if (realloc) {
893894
#ifndef NDEBUG
895+
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
894896
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
895897
#endif
896898

ggml/src/ggml-metal/ggml-metal-common.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ static bool ggml_mem_ranges_add_dst(ggml_mem_ranges_t mrs, const ggml_tensor * t
112112
}
113113

114114
bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
115-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
115+
for (int i = 0; i < GGML_MAX_SRC; i++) {
116116
if (tensor->src[i]) {
117117
ggml_mem_ranges_add_src(mrs, tensor->src[i]);
118118
}
@@ -173,7 +173,7 @@ static bool ggml_mem_ranges_check_dst(ggml_mem_ranges_t mrs, const ggml_tensor *
173173
}
174174

175175
bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
176-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
176+
for (int i = 0; i < GGML_MAX_SRC; i++) {
177177
if (tensor->src[i]) {
178178
if (!ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
179179
return false;

0 commit comments

Comments
 (0)