Commit a153710

Merge pull request #246 from menloresearch/update-dev-from-master-2025-09-09-00-33
Sync master with upstream release b6423
2 parents 796170a + 7057faf commit a153710

Note: this is a large commit and some of its content is hidden by default, so not every changed file appears below.

45 files changed: +2630 / -1465 lines

common/chat.cpp

Lines changed: 139 additions & 0 deletions
@@ -631,6 +631,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
@@ -698,11 +699,13 @@ static void parse_json_tool_calls(
     size_t from = std::string::npos;
     auto first = true;
     while (true) {
+        auto start_pos = builder.pos();
         auto res = function_regex_start_only && first
             ? builder.try_consume_regex(*function_regex_start_only)
             : function_regex
                 ? builder.try_find_regex(*function_regex, from)
                 : std::nullopt;
+
         if (res) {
             std::string name;
             if (get_function_name) {
@@ -737,6 +740,8 @@ static void parse_json_tool_calls(
                 return;
             }
             throw common_chat_msg_partial_exception("incomplete tool call");
+        } else {
+            builder.move_to(start_pos);
         }
         break;
     }
@@ -1388,6 +1393,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for DeepSeek V3.1 template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    auto prompt = apply(tmpl, inputs,
+                        /* messages_override= */ inputs.messages,
+                        /* tools_override= */ std::nullopt,
+                        additional_context);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
+                    "\" " + builder.add_schema(name + "-args", parameters) + " "
+                    "\"<|tool▁call▁end|>\""));
+            });
+            // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+            // so we accept common variants (then it's all constrained)
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+                "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+                "\"<|tool▁calls▁end|>\""
+                " space");
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<|tool▁calls▁begin|>",
+                "<|tool▁call▁begin|>",
+                "<|tool▁sep|>",
+                "<|tool▁call▁end|>",
+                "<|tool▁calls▁end|>",
+            };
+        });
+    }
+    return data;
+}
+
 static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     if (!builder.syntax().parse_tool_calls) {
@@ -1409,6 +1479,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
+    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
+    static const common_regex tool_calls_end("<|tool▁calls▁end|>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    parse_json_tool_calls(
+        builder,
+        /* block_open= */ tool_calls_begin,
+        /* function_regex_start_only= */ std::nullopt,
+        function_regex,
+        close_regex,
+        tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_deepseek_v3_1_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
+            common_chat_parse_deepseek_v3_1_content(builder);
        }
+    }
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
@@ -2365,6 +2495,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         }
     }
 
+    // DeepSeek V3.1: detect based on specific patterns in the template
+    if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+        params.json_schema.is_null()) {
+        return common_chat_params_init_deepseek_v3_1(tmpl, params);
+    }
+
     // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
     if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2537,6 +2673,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
            common_chat_parse_deepseek_r1(builder);
            break;
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+            common_chat_parse_deepseek_v3_1(builder);
+            break;
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
            common_chat_parse_functionary_v3_2(builder);
            break;
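For orientation, below is a minimal sketch of the kind of raw DeepSeek V3.1 completion the new grammar and parser target. The special tokens come from the preserved_tokens list above; the "get_weather" tool name and its JSON arguments are invented for illustration.

// Sketch only (not from this commit): an example completion for the new DeepSeek V3.1 path.
#include <string>

static const std::string example_dsv31_output =
    "<think>The user asked for the weather, so a tool call is needed.</think>"
    "<|tool▁calls▁begin|><|tool▁call▁begin|>get_weather<|tool▁sep|>"
    "{\"location\": \"Paris\"}"
    "<|tool▁call▁end|><|tool▁calls▁end|>";
// common_chat_parse_deepseek_v3_1() would split the <think>...</think> span into reasoning
// content, then common_chat_parse_deepseek_v3_1_content() would match function_regex /
// close_regex on the remainder and emit a single get_weather tool call.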

common/chat.h

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_GRANITE,

common/json-schema-to-grammar.cpp

Lines changed: 21 additions & 1 deletion
@@ -843,9 +843,10 @@ class SchemaConverter {
                 _build_object_rule(
                     properties, required, name,
                     schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
             std::string hybrid_name = name;
             std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                 if (comp_schema.contains("$ref")) {
@@ -857,6 +858,14 @@ class SchemaConverter {
                             required.insert(prop.key());
                         }
                     }
+                } else if (comp_schema.contains("enum")) {
+                    for (const auto & v : comp_schema["enum"]) {
+                        const auto rule = _generate_constant_rule(v);
+                        if (enum_values.find(rule) == enum_values.end()) {
+                            enum_values[rule] = 0;
+                        }
+                        enum_values[rule] += 1;
+                    }
                 } else {
                     // todo warning
                 }
@@ -870,6 +879,17 @@ class SchemaConverter {
                     add_component(t, true);
                 }
             }
+            if (!enum_values.empty()) {
+                std::vector<std::string> enum_intersection;
+                for (const auto & p : enum_values) {
+                    if (p.second == schema["allOf"].size()) {
+                        enum_intersection.push_back(p.first);
+                    }
+                }
+                if (!enum_intersection.empty()) {
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                }
+            }
             return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
         } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
             json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
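The new branch collects constant rules per allOf component and, when a constant appears in every component, emits the intersection as an alternation instead of falling back to an object rule. A small hypothetical schema of the kind this targets (not taken from the repository's tests) could look like the sketch below; with the "string" type now accepted, the converter would emit a rule of roughly the form ("\"b\"" | "\"c\"") space.

// Illustrative only: a "string"-typed allOf of two enums whose shared values are "b" and "c".
#include <nlohmann/json.hpp>

static const nlohmann::ordered_json example_allof_enum_schema = nlohmann::ordered_json::parse(R"({
    "type": "string",
    "allOf": [
        { "enum": ["a", "b", "c"] },
        { "enum": ["b", "c", "d"] }
    ]
})");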

convert_hf_to_gguf.py

Lines changed: 14 additions & 0 deletions
@@ -5128,6 +5128,20 @@ class EmbeddingGemma(Gemma3Model):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+
+        # Override the sliding window size as it gets adjusted by the Gemma3TextConfig
+        # constructor. We want to use the value from the original model's config.json.
+        # ref: https://github.com/huggingface/transformers/pull/40700
+        with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
+            config = json.load(f)
+            orig_sliding_window = config.get("sliding_window")
+            if orig_sliding_window is None:
+                raise ValueError("sliding_window not found in model config - this is required for the model")
+
+        logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
+                    f"instead of {self.hparams['sliding_window']}")
+        self.gguf_writer.add_sliding_window(orig_sliding_window)
+
         self._try_set_pooling_type()
 
 
examples/json_schema_to_grammar.py

Lines changed: 14 additions & 1 deletion
@@ -586,9 +586,10 @@ def visit(self, schema, name):
             properties = list(schema.get('properties', {}).items())
             return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))
 
-        elif schema_type in (None, 'object') and 'allOf' in schema:
+        elif schema_type in (None, 'object', 'string') and 'allOf' in schema:
             required = set()
             properties = []
+            enum_sets = []
             hybrid_name = name
             def add_component(comp_schema, is_required):
                 if (ref := comp_schema.get('$ref')) is not None:
@@ -600,13 +601,25 @@ def add_component(comp_schema, is_required):
                         if is_required:
                             required.add(prop_name)
 
+                if 'enum' in comp_schema:
+                    enum_sets.append(set(comp_schema['enum']))
+
             for t in schema['allOf']:
                 if 'anyOf' in t:
                     for tt in t['anyOf']:
                         add_component(tt, is_required=False)
                 else:
                     add_component(t, is_required=True)
 
+            if enum_sets:
+                enum_intersection = enum_sets[0]
+                for s in enum_sets[1:]:
+                    enum_intersection &= s
+
+                if enum_intersection:
+                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
+                    return self._add_rule(rule_name, rule)
+
             return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
 
         elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 0 deletions
@@ -134,6 +134,7 @@ extern "C" {
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
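A short usage sketch of the new conversion helper; the sample values are invented, and, consistent with the ggml_cast note added in ggml.h below, the fractional part is discarded on the f32 to i32 path.

// Sketch only: convert a small f32 buffer to i32 with the new helper.
#include <stdint.h>
#include "ggml-cpu.h"

void example_fp32_to_i32(void) {
    const float src[4] = { 0.0f, 1.9f, -2.5f, 7.0f };
    int32_t     dst[4];
    ggml_cpu_fp32_to_i32(src, dst, 4);  // fractional parts are dropped, per the ggml_cast note
}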

ggml/include/ggml.h

Lines changed: 6 additions & 1 deletion
@@ -1404,6 +1404,7 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // note: casting from f32 to i32 will discard the fractional part
     GGML_API struct ggml_tensor * ggml_cast(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1528,7 +1529,11 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // supports 3D: a->ne[2] == b->ne[1]
+    // supports 4D a:
+    //      a   [n_embd, ne1, ne2, ne3]
+    //      b   I32 [n_rows, ne2, ne3, 1]
+    //
+    //  return  [n_embd, n_rows, ne2, ne3]
     GGML_API struct ggml_tensor * ggml_get_rows(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,  // data
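A sketch of a 4D ggml_get_rows call that matches the updated comment; the context setup is omitted and the tensor sizes are arbitrary.

// Sketch only: shapes follow the comment above. a is [n_embd, ne1, ne2, ne3], the I32
// index tensor b is [n_rows, ne2, ne3, 1], and the result is [n_embd, n_rows, ne2, ne3].
#include "ggml.h"

struct ggml_tensor * example_get_rows_4d(struct ggml_context * ctx) {
    const int64_t n_embd = 64, ne1 = 10, ne2 = 4, ne3 = 2, n_rows = 3;
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, ne1, ne2, ne3);
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_I32, n_rows, ne2, ne3, 1);
    return ggml_get_rows(ctx, a, b);  // [n_embd, n_rows, ne2, ne3]
}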

ggml/src/ggml-backend-impl.h

Lines changed: 3 additions & 0 deletions
@@ -114,6 +114,9 @@ extern "C" {
         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
         // wait for an event on on a different stream
         void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+
+        // (optional) sort/optimize the nodes in the graph
+        void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     };
 
     struct ggml_backend {

ggml/src/ggml-backend.cpp

Lines changed: 11 additions & 0 deletions
@@ -463,6 +463,13 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
     backend->iface.event_wait(backend, event);
 }
 
+static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(backend);
+    if (backend->iface.optimize_graph != NULL) {
+        backend->iface.optimize_graph(backend, cgraph);
+    }
+}
+
 // Backend device
 
 const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
@@ -1298,6 +1305,10 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
+        // Optimize this split of the graph. This needs to happen before we make graph_copy,
+        // so they are in sync.
+        ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
+
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
             assert(graph_copy->size > (graph_copy->n_nodes + 1));

ggml/src/ggml-blas/ggml-blas.cpp

Lines changed: 1 addition & 0 deletions
@@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_compute   = */ ggml_backend_blas_graph_compute,
     /* .event_record    = */ NULL,
     /* .event_wait      = */ NULL,
+    /* .optimize_graph  = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_blas_guid(void) {
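Taken together, these changes add an optional optimize_graph slot to ggml_backend_i, a dispatcher in ggml-backend.cpp that invokes it once per scheduler split before the split's graph is copied, and a NULL entry for the BLAS backend. A hypothetical backend that wants the hook could wire it up roughly as in the sketch below; the example function and its (empty) behaviour are not part of this commit.

// Hypothetical sketch: a backend-provided optimize_graph callback. The scheduler calls it
// via ggml_backend_optimize_graph() for each split, before graph_copy is built.
static void ggml_backend_example_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    // a real backend could reorder or annotate the split's nodes here;
    // this sketch intentionally does nothing
    (void) backend;
    (void) cgraph;
}

The backend would then set /* .optimize_graph = */ ggml_backend_example_optimize_graph in its ggml_backend_i table; backends that do not need the hook keep NULL there, as the BLAS backend does above.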
