
Commit 0b3190b

Merge pull request #242 from menloresearch/update-dev-from-master-2025-09-05-00-33
Sync master with upstream release b6387
2 parents ff2d336 + 4fd1242 commit 0b3190b


46 files changed: +2073, -189 lines

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -2466,7 +2466,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        "number of layers to store in VRAM",
+        string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
        [](common_params & params, int value) {
            params.n_gpu_layers = value;
            if (!llama_supports_gpu_offload()) {

common/chat.cpp

Lines changed: 98 additions & 1 deletion
@@ -623,6 +623,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -1184,6 +1185,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     return data;
 }
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "name",
+                            {
+                                { "type", "string" },
+                                { "const", function.at("name") },
+                            } },
+                          { "arguments", function.at("parameters") },
+                      } },
+                    { "required", json::array({ "name", "arguments" }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(</think>\\s*)" :
+                "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<TOOLCALL>)[\\s\\S]*" });
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
@@ -1830,7 +1892,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
            // If thinking_forced_open, then we capture the </think> tag in the grammar,
            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
            std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
-                "(\\s*"
+                "\\s*("
                "(?:<tool_call>"
                "|<function"
                "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
@@ -2060,6 +2122,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            builder.add_tool_calls(tool_calls_data.json);
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2293,6 +2382,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_seed_oss(tmpl, params, inputs);
     }
 
+    // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+        return common_chat_params_init_nemotron_v2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2454,6 +2548,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SEED_OSS:
             common_chat_parse_seed_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+            common_chat_parse_nemotron_v2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
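
Read together, the init and parse halves above pin down the Nemotron V2 tool-call wire format: an optional <think>...</think> reasoning block, then a JSON array of {name, arguments} objects wrapped in <TOOLCALL>...</TOOLCALL> tags. A minimal sketch of a conforming assistant message follows (the get_weather tool and its argument are made up for illustration, not part of the diff):

// Sketch: an assistant output the grammar above permits and
// common_chat_parse_nemotron_v2 consumes. Tool name/arguments are hypothetical.
#include <iostream>

int main() {
    const char * sample =
        "<think>Need the weather, so call the tool.</think>\n"
        "<TOOLCALL>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Hanoi\"}}]</TOOLCALL>";
    // try_find_regex locates "<TOOLCALL>", consume_json reads the array,
    // and try_consume_literal then requires the closing "</TOOLCALL>".
    std::cout << sample << "\n";
    return 0;
}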

common/chat.h

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
     COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

convert_hf_to_gguf.py

Lines changed: 9 additions & 0 deletions
@@ -5122,6 +5122,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("Gemma3TextModel")
+class EmbeddingGemma(Gemma3Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+
+
 @ModelBase.register("Gemma3ForConditionalGeneration")
 class Gemma3VisionModel(MmprojModel):
     def set_gguf_parameters(self):

docs/backend/CANN.md

Lines changed: 5 additions & 9 deletions
@@ -293,17 +293,14 @@ We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers fr
 
 ## Environment variable setup
 
-### GGML_CANN_ASYNC_MODE
-
-Enables asynchronous operator submission. Disabled by default.
-
 ### GGML_CANN_MEM_POOL
 
-Specifies the memory pool management strategy:
+Specifies the memory pool management strategy, Default is vmm.
 
 - vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
 
 - prio: Employs a priority queue-based memory pool management.
+
 - leg: Uses a fixed-size buffer pool.
 
 ### GGML_CANN_DISABLE_BUF_POOL_CLEAN
@@ -312,9 +309,8 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
 
 ### GGML_CANN_WEIGHT_NZ
 
-Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+Converting the matmul weight format from ND to NZ to improve performance. Enabled by default.
 
-### GGML_CANN_DISABLE_ACL_GRAPH
+### GGML_CANN_ACL_GRAPH
 
-When this variable is set, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode.
-This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.
+Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
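
These docs track a semantic flip in the code below: the CANN flags change from opt-in disable switches to opt-out enable switches, so an unset variable now means enabled. A minimal sketch of the new default-on behavior (parse_bool_stub and the exact accepted spellings are stand-ins, not the llama.cpp implementation):

// Sketch: unset or "on" enables the feature; users opt out with e.g.
// GGML_CANN_ACL_GRAPH=off. Mirrors .value_or("on") in the diffs below.
#include <cstdlib>
#include <iostream>
#include <string>

static bool parse_bool_stub(const std::string & v) { // stand-in for llama.cpp's parse_bool
    return v == "on" || v == "1" || v == "true" || v == "yes";
}

int main() {
    const char * v = std::getenv("GGML_CANN_ACL_GRAPH");
    const bool acl_graph_mode = parse_bool_stub(v ? v : "on"); // default flipped to on
    std::cout << "execution mode: " << (acl_graph_mode ? "GRAPH" : "EAGER") << "\n";
    return 0;
}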

ggml/include/ggml.h

Lines changed: 50 additions & 1 deletion
@@ -511,6 +511,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_IM2COL_3D,
         GGML_OP_CONV_2D,
         GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
@@ -1870,6 +1871,41 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_im2col_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2, // dilation depth
+            enum ggml_type        dst_type);
+
+    // a: [OC*IC, KD, KH, KW]
+    // b: [N*IC, ID, IH, IW]
+    // result: [N*OC, OD, OH, OW]
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2  // dilation depth
+            );
+
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
@@ -1941,7 +1977,7 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
-    GGML_API struct ggml_tensor * ggml_conv_3d(
+    GGML_API struct ggml_tensor * ggml_conv_3d_direct(
             struct ggml_context * ctx,
             struct ggml_tensor  * a, // kernel [KW, KH, KD, IC * OC]
             struct ggml_tensor  * b, // input [W, H, D, C * N]
@@ -2048,6 +2084,19 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    GGML_API struct ggml_tensor * ggml_pad_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   lp0,
+            int                   rp0,
+            int                   lp1,
+            int                   rp1,
+            int                   lp2,
+            int                   rp2,
+            int                   lp3,
+            int                   rp3
+            );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
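
A minimal calling sketch for the two new public entry points, assuming the usual ggml context API; the shape comments above are read as [ne3, ne2, ne1, ne0], and all sizes here are arbitrary:

// Sketch: a 3D convolution followed by asymmetric zero padding via ggml_pad_ext.
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { 64u * 1024 * 1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);

    const int64_t IC = 4, OC = 8, N = 1;
    // kernel [OC*IC, KD, KH, KW] -> ne = {KW, KH, KD, OC*IC}
    struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, OC * IC);
    // input [N*IC, ID, IH, IW] -> ne = {IW, IH, ID, N*IC}
    struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, N * IC);

    struct ggml_tensor * conv = ggml_conv_3d(ctx, a, b, IC,
                                             1, 1, 1,  // strides  s0, s1, s2
                                             1, 1, 1,  // paddings p0, p1, p2
                                             1, 1, 1); // dilation d0, d1, d2

    // one extra zero on the left edge of dim 0 only; every other edge untouched
    struct ggml_tensor * padded = ggml_pad_ext(ctx, conv, 1, 0, 0, 0, 0, 0, 0, 0);
    (void) padded;

    ggml_free(ctx);
    return 0;
}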

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 17 additions & 9 deletions
@@ -589,9 +589,16 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // the position of elements in the array means which dirction to padding,
     // each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind,
     // dim2.front, dim2.behind, dim3.front, dim3.behind]
-    int64_t paddings[] = {
-        0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
-        0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
+    const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
+    const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
+    const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
+    const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
+    const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
+    const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
+
+    int64_t paddings[] = {lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3};
     aclnn_pad(ctx, acl_src, acl_dst, paddings);
     ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }
@@ -975,18 +982,19 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     );
 
     // build rstd, zero...
-    size_t acl_rstd_nb[GGML_MAX_DIMS];
+    int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]};
+    size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
     acl_rstd_nb[0] = sizeof(float);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1];
+    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+        acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
     }
     aclTensor* acl_rstd = get_f32_cache_acl_tensor(
         ctx,
         &ctx.rms_norm_zero_tensor_cache.cache,
         ctx.rms_norm_zero_tensor_cache.size,
-        src->ne,
+        acl_rstd_ne,
         acl_rstd_nb,
-        GGML_MAX_DIMS,
+        GGML_MAX_DIMS - 1,
         0.0f // value
     );
 
@@ -1955,7 +1963,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     aclTensor* acl_weight_tensor;
 
     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (weight_to_nz && is_matmul_weight(weight)) {
         int64_t acl_stride[2] = {1, transpose_ne[1]};
 
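The eight values the pad hunk reads back are the ones ggml_pad_ext stores as int32 op params, in the order lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3. A small sketch that checks this contract from the public side (reading dst->op_params directly is an assumption about the layout, mirroring the indices used above):

// Sketch: recover ggml_pad_ext's edge sizes from the destination tensor.
#include "ggml.h"
#include <cstdio>
#include <cstring>

// Hypothetical helper, not part of ggml: dump the per-dimension padding.
static void dump_pad_params(const struct ggml_tensor * dst) {
    int32_t p[8];
    std::memcpy(p, dst->op_params, sizeof(p)); // assumed lp0,rp0,...,lp3,rp3 order
    for (int d = 0; d < 4; ++d) {
        std::printf("dim%d: left=%d right=%d\n", d, p[2 * d], p[2 * d + 1]);
    }
}

int main(void) {
    struct ggml_init_params ip = { 16u * 1024 * 1024, NULL, false };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 8, 8, 1, 1);
    struct ggml_tensor * dst = ggml_pad_ext(ctx, t, 1, 2, 0, 0, 0, 0, 0, 0);
    dump_pad_params(dst); // expect dim0: left=1 right=2
    ggml_free(ctx);
    return 0;
}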
ggml/src/ggml-cann/common.h

Lines changed: 1 addition & 1 deletion
@@ -420,7 +420,7 @@ struct ggml_backend_cann_context {
     GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                   device, async_mode ? "ON" : "OFF");
 #ifdef USE_ACL_GRAPH
-    acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or("")));
+    acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
     GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
                   __func__, device,
                   acl_graph_mode ? "GRAPH" : "EAGER",
