
Commit 65ac66f

Merge branch 'ggml-org:master' into master
2 parents 2ef71a7 + fe1c92c commit 65ac66f


41 files changed: +2506 −1440 lines

common/chat.cpp

Lines changed: 139 additions & 0 deletions
@@ -631,6 +631,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
@@ -698,11 +699,13 @@ static void parse_json_tool_calls(
     size_t from = std::string::npos;
     auto first = true;
     while (true) {
+        auto start_pos = builder.pos();
         auto res = function_regex_start_only && first
             ? builder.try_consume_regex(*function_regex_start_only)
             : function_regex
                 ? builder.try_find_regex(*function_regex, from)
                 : std::nullopt;
+
         if (res) {
             std::string name;
             if (get_function_name) {
@@ -737,6 +740,8 @@ static void parse_json_tool_calls(
                 return;
             }
             throw common_chat_msg_partial_exception("incomplete tool call");
+        } else {
+            builder.move_to(start_pos);
         }
         break;
     }
@@ -1388,6 +1393,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for DeepSeek V3.1 template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    auto prompt = apply(tmpl, inputs,
+        /* messages_override= */ inputs.messages,
+        /* tools_override= */ std::nullopt,
+        additional_context);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
+                    "\" " + builder.add_schema(name + "-args", parameters) + " "
+                    "\"<|tool▁call▁end|>\""));
+            });
+            // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+            // so we accept common variants (then it's all constrained)
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+                "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+                "\"<|tool▁calls▁end|>\""
+                " space");
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<|tool▁calls▁begin|>",
+                "<|tool▁call▁begin|>",
+                "<|tool▁sep|>",
+                "<|tool▁call▁end|>",
+                "<|tool▁calls▁end|>",
+            };
+        });
+    }
+    return data;
+}
+
 static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     if (!builder.syntax().parse_tool_calls) {
@@ -1409,6 +1479,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }

+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
+    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
+    static const common_regex tool_calls_end("<|tool▁calls▁end|>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    parse_json_tool_calls(
+        builder,
+        /* block_open= */ tool_calls_begin,
+        /* function_regex_start_only= */ std::nullopt,
+        function_regex,
+        close_regex,
+        tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_deepseek_v3_1_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
+            common_chat_parse_deepseek_v3_1_content(builder);
+        }
+    }
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
@@ -2365,6 +2495,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         }
     }

+    // DeepSeek V3.1: detect based on specific patterns in the template
+    if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+        params.json_schema.is_null()) {
+        return common_chat_params_init_deepseek_v3_1(tmpl, params);
+    }
+
     // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
     if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2537,6 +2673,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
             common_chat_parse_deepseek_r1(builder);
             break;
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+            common_chat_parse_deepseek_v3_1(builder);
+            break;
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
             common_chat_parse_functionary_v3_2(builder);
             break;
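
For reference, with tools enabled DeepSeek V3.1 is expected to emit blocks of the form <|tool▁calls▁begin|><|tool▁call▁begin|>NAME<|tool▁sep|>ARGS<|tool▁call▁end|>...<|tool▁calls▁end|>, which the grammar above constrains and common_chat_parse_deepseek_v3_1_content() extracts. The sketch below decomposes such a block with plain std::regex; it is a standalone illustration (the tool name and arguments are hypothetical), not the common_regex-based parser used in chat.cpp:

#include <iostream>
#include <regex>
#include <string>

int main() {
    // Hypothetical model output in the DeepSeek V3.1 tool-call format.
    const std::string output =
        "<|tool▁calls▁begin|>"
        "<|tool▁call▁begin|>get_weather<|tool▁sep|>{\"city\":\"Paris\"}<|tool▁call▁end|>"
        "<|tool▁calls▁end|>";

    // One call = begin tag, function name, separator token, JSON args, end tag.
    const std::regex call_re(
        "<\\|tool▁call▁begin\\|>([^<]+)<\\|tool▁sep\\|>([^<]*)<\\|tool▁call▁end\\|>");

    for (auto it = std::sregex_iterator(output.begin(), output.end(), call_re);
         it != std::sregex_iterator(); ++it) {
        std::cout << "tool: " << (*it)[1] << "  args: " << (*it)[2] << "\n";
        // prints: tool: get_weather  args: {"city":"Paris"}
    }
    return 0;
}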

common/chat.h

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_GRANITE,

convert_hf_to_gguf.py

Lines changed: 14 additions & 0 deletions
@@ -5128,6 +5128,20 @@ class EmbeddingGemma(Gemma3Model):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+
+        # Override the sliding window size as it gets adjusted by the Gemma3TextConfig
+        # constructor. We want to use the value from the original model's config.json.
+        # ref: https://github.com/huggingface/transformers/pull/40700
+        with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
+            config = json.load(f)
+        orig_sliding_window = config.get("sliding_window")
+        if orig_sliding_window is None:
+            raise ValueError("sliding_window not found in model config - this is required for the model")
+
+        logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
+                    f"instead of {self.hparams['sliding_window']}")
+        self.gguf_writer.add_sliding_window(orig_sliding_window)
+
         self._try_set_pooling_type()
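
The override logic is straightforward: read the original config.json, require sliding_window, and write it to the GGUF instead of the transformers-adjusted value. For C++ readers, an equivalent guard with nlohmann::json (the JSON library llama.cpp already vendors) might look like the sketch below; the literal file path is illustrative only:

#include <cstdint>
#include <fstream>
#include <iostream>
#include <nlohmann/json.hpp>

int main() {
    // Read the original model config, as the Python change does with json.load().
    std::ifstream f("config.json"); // illustrative; the script uses self.dir_model / "config.json"
    const nlohmann::json config = nlohmann::json::parse(f);

    // Same guard as the Python: the value is required, so fail loudly if absent.
    if (!config.contains("sliding_window")) {
        std::cerr << "sliding_window not found in model config\n";
        return 1;
    }
    const int64_t orig_sliding_window = config["sliding_window"].get<int64_t>();
    std::cout << "Using original sliding_window from config: " << orig_sliding_window << "\n";
    return 0;
}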

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 0 deletions
@@ -134,6 +134,7 @@ extern "C" {
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

     GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

ggml/include/ggml.h

Lines changed: 6 additions & 1 deletion
@@ -1404,6 +1404,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

+    // note: casting from f32 to i32 will discard the fractional part
     GGML_API struct ggml_tensor * ggml_cast(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1528,7 +1529,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

-    // supports 3D: a->ne[2] == b->ne[1]
+    // supports 4D a:
+    //    a      [n_embd, ne1, ne2, ne3]
+    //    b I32  [n_rows, ne2, ne3, 1]
+    //
+    //    return [n_embd, n_rows, ne2, ne3]
     GGML_API struct ggml_tensor * ggml_get_rows(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // data
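
The ggml_get_rows comment change documents a generalization from 3D to fully 4D batched row lookups. A minimal shape-level sketch of the new contract, assuming only the public ggml context API (the sizes here are arbitrary):

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a: [n_embd=8, ne1=5, ne2=3, ne3=2]
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 5, 3, 2);
    // b: I32 row indices, [n_rows=4, ne2=3, ne3=2, 1] -- ne2/ne3 match a's
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_I32, 4, 3, 2);

    struct ggml_tensor * rows = ggml_get_rows(ctx, a, b);
    // rows->ne == { 8, 4, 3, 2 }, i.e. [n_embd, n_rows, ne2, ne3]

    ggml_free(ctx);
    return 0;
}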

ggml/src/ggml-backend-impl.h

Lines changed: 3 additions & 0 deletions
@@ -114,6 +114,9 @@ extern "C" {
         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
         // wait for an event on a different stream
         void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+
+        // (optional) sort/optimize the nodes in the graph
+        void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     };

     struct ggml_backend {
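
The hook is optional; backends that do not need it leave the new slot NULL (as the BLAS and CANN tables below do). A hypothetical sketch of what a backend-side implementation could look like — the function name and the reordering policy are invented here, and a real backend must keep every node after all of its inputs:

#include "ggml-impl.h"         // struct ggml_cgraph
#include "ggml-backend-impl.h" // ggml_backend_i

// Hypothetical example hook; the scheduler calls it once per split, before
// the split's graph is copied (see ggml-backend.cpp below).
static void ggml_backend_example_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    GGML_UNUSED(backend);
    // A real implementation may reorder cgraph->nodes[0..n_nodes) in place,
    // e.g. grouping independent matrix multiplications so they can be batched,
    // as long as dependency order is preserved.
    for (int i = 0; i < cgraph->n_nodes; i++) {
        (void) cgraph->nodes[i]; // inspect node->op / node->src[*] and reorder here
    }
}

// Wired up in the backend's interface table:
//     /* .optimize_graph = */ ggml_backend_example_optimize_graph,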

ggml/src/ggml-backend.cpp

Lines changed: 11 additions & 0 deletions
@@ -463,6 +463,13 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
     backend->iface.event_wait(backend, event);
 }

+static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend);
+    if (backend->iface.optimize_graph != NULL) {
+        backend->iface.optimize_graph(backend, cgraph);
+    }
+}
+
 // Backend device

 const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
@@ -1298,6 +1305,10 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

+        // Optimize this split of the graph. This needs to happen before we make graph_copy,
+        // so they are in sync.
+        ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
+
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             assert(graph_copy->size > (graph_copy->n_nodes + 1));

ggml/src/ggml-blas/ggml-blas.cpp

Lines changed: 1 addition & 0 deletions
@@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_compute   = */ ggml_backend_blas_graph_compute,
     /* .event_record    = */ NULL,
     /* .event_wait      = */ NULL,
+    /* .optimize_graph  = */ NULL,
 };

 static ggml_guid_t ggml_backend_blas_guid(void) {

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 1 addition & 0 deletions
@@ -2690,6 +2690,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
     /* .graph_compute   = */ ggml_backend_cann_graph_compute,
     /* .event_record    = */ ggml_backend_cann_event_record,
     /* .event_wait      = */ ggml_backend_cann_event_wait,
+    /* .optimize_graph  = */ NULL,
 };

 /**

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 14 additions & 1 deletion
@@ -373,6 +373,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_K,
         .nrows = 1,
     },
+    [GGML_TYPE_I32] = {
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
+    },
 };

 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -2696,7 +2699,10 @@ struct ggml_cplan ggml_graph_plan(
                 if (ggml_is_quantized(node->type) ||
                     // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
                     (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
-                    (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
+                    (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
+                    // conversion between F32 and I32
+                    (node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
+                    (node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 }
             } break;
@@ -3258,6 +3264,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
     }
 }

+void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = x[i];
+    }
+}
+
 void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX2__)
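
The new ggml_cpu_fp32_to_i32 kernel relies on C's implicit float-to-int conversion, which truncates toward zero — the behavior the "discard the fractional part" note on ggml_cast refers to. A tiny standalone demonstration of the per-element semantics:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const float x[4] = { 1.9f, -1.9f, 0.5f, -0.5f };
    int32_t y[4];
    for (int i = 0; i < 4; ++i) {
        y[i] = (int32_t) x[i]; // same conversion the new kernel performs, truncating toward zero
    }
    printf("%d %d %d %d\n", y[0], y[1], y[2], y[3]); // prints: 1 -1 0 0
    return 0;
}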
