
Commit f0d4128

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	docs/backend/CANN.md
#	examples/model-conversion/Makefile
#	examples/model-conversion/scripts/causal/compare-embeddings-logits.sh
#	examples/model-conversion/scripts/causal/convert-model.sh
#	examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py
#	examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
#	examples/model-conversion/scripts/causal/run-converted-model.sh
#	examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
#	examples/model-conversion/scripts/embedding/convert-model.sh
#	examples/model-conversion/scripts/embedding/modelcard.template
#	examples/model-conversion/scripts/embedding/run-converted-model.sh
#	examples/model-conversion/scripts/utils/create-collection-add-model.sh
#	examples/model-conversion/scripts/utils/inspect-converted-model.sh
#	examples/model-conversion/scripts/utils/inspect-org-model.py
#	examples/model-conversion/scripts/utils/perplexity-gen.sh
#	examples/model-conversion/scripts/utils/perplexity-run-simple.sh
#	examples/model-conversion/scripts/utils/perplexity-run.sh
#	examples/model-conversion/scripts/utils/quantize.sh
#	examples/model-conversion/scripts/utils/run-embedding-server.sh
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/common.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	src/llama-context.cpp
#	tests/test-backend-ops.cpp
#	tests/test-chat.cpp

2 parents: 979e211 + 5d6688d

43 files changed: +1403 −251 lines. (This is a large commit, so only a subset of the changed files is reproduced below.)

common/arg.cpp

Lines changed: 1 addition & 1 deletion
@@ -2468,7 +2468,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        "number of layers to store in VRAM",
+        string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
         [](common_params & params, int value) {
             params.n_gpu_layers = value;
             if (!llama_supports_gpu_offload()) {

common/chat.cpp

Lines changed: 98 additions & 1 deletion
@@ -623,6 +623,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -1184,6 +1185,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     return data;
 }
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "name",
+                            {
+                                { "type", "string" },
+                                { "const", function.at("name") },
+                            } },
+                          { "arguments", function.at("parameters") },
+                      } },
+                    { "required", json::array({ "name", "arguments" }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(</think>\\s*)" :
+                "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<TOOLCALL>)[\\s\\S]*" });
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
@@ -1830,7 +1892,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
             // If thinking_forced_open, then we capture the </think> tag in the grammar,
             // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
             std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
-                "(\\s*"
+                "\\s*("
                 "(?:<tool_call>"
                 "|<function"
                 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
@@ -2060,6 +2122,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            builder.add_tool_calls(tool_calls_data.json);
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2293,6 +2382,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_seed_oss(tmpl, params, inputs);
     }
 
+    // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+        return common_chat_params_init_nemotron_v2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2454,6 +2548,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SEED_OSS:
             common_chat_parse_seed_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+            common_chat_parse_nemotron_v2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
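
Taken together, the grammar and the parser above define the wire format: optional <think>…</think> reasoning, followed by a JSON array of {name, arguments} objects wrapped in <TOOLCALL>…</TOOLCALL>. A hypothetical completion that both would accept (the tool name and arguments here are illustrative, not taken from this commit):

    <think>The user wants the weather, so I should call the weather tool.</think>
    <TOOLCALL>[{"name": "get_weather", "arguments": {"city": "Paris"}}]</TOOLCALL>

When parallel_tool_calls is off, the schema's maxItems = 1 restricts the array to a single call; otherwise several calls may appear in one array.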

common/chat.h

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
     COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

convert_hf_to_gguf.py

Lines changed: 9 additions & 0 deletions
@@ -5122,6 +5122,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("Gemma3TextModel")
+class EmbeddingGemma(Gemma3Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+
+
 @ModelBase.register("Gemma3ForConditionalGeneration")
 class Gemma3VisionModel(MmprojModel):
     def set_gguf_parameters(self):
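
EmbeddingGemma thus reuses the entire Gemma3Model conversion path and only extends set_gguf_parameters() to call _try_set_pooling_type(), so the GGUF additionally records the pooling mode an embedding model uses to reduce per-token states to a single sentence vector.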

examples/model-conversion/scripts/utils/curl-embedding-server.sh

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 curl --request POST \
   --url http://localhost:8080/embedding \
   --header "Content-Type: application/json" \
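
The env shebang is a portability fix: it resolves bash through PATH, so the script also runs on systems where bash is not installed at /bin/bash (e.g. the BSDs or NixOS).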

ggml/include/ggml.h

Lines changed: 50 additions & 1 deletion
@@ -517,6 +517,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_IM2COL_3D,
         GGML_OP_CONV_2D,
         GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
@@ -1895,6 +1896,41 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_im2col_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2, // dilation depth
+            enum ggml_type        dst_type);
+
+    // a: [OC*IC, KD, KH, KW]
+    // b: [N*IC, ID, IH, IW]
+    // result: [N*OC, OD, OH, OW]
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2  // dilation depth
+            );
+
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
@@ -1966,7 +2002,7 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
-    GGML_API struct ggml_tensor * ggml_conv_3d(
+    GGML_API struct ggml_tensor * ggml_conv_3d_direct(
             struct ggml_context * ctx,
             struct ggml_tensor  * a, // kernel [KW, KH, KD, IC * OC]
             struct ggml_tensor  * b, // input [W, H, D, C * N]
@@ -2073,6 +2109,19 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    GGML_API struct ggml_tensor * ggml_pad_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   lp0,
+            int                   rp0,
+            int                   lp1,
+            int                   rp1,
+            int                   lp2,
+            int                   rp2,
+            int                   lp3,
+            int                   rp3
+            );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
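
The header only declares the new entry points, so here is a minimal C sketch of how they might be called; the memory budget, tensor sizes, and main() scaffolding are illustrative assumptions, not part of this commit. Note that ggml's ne order is the reverse of the row-major shape comments above, so the [OC*IC, KD, KH, KW] kernel is created as KW x KH x KD x (OC*IC):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 64*1024*1024, // illustrative; size this to your graph
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        const int64_t IC = 4, OC = 8, N = 1;

        // kernel [KW, KH, KD, OC*IC] and input [IW, IH, ID, N*IC] in ne order
        struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3,  3, OC*IC);
        struct ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, N*IC);

        // stride 1, padding 1, dilation 1 per spatial dim:
        // OW = (16 + 2*1 - 3) + 1 = 16, likewise OH and OD -> result [16, 16, 16, N*OC]
        struct ggml_tensor * y = ggml_conv_3d(ctx, k, x, IC,
                                              1, 1, 1,  // s0, s1, s2
                                              1, 1, 1,  // p0, p1, p2
                                              1, 1, 1); // d0, d1, d2

        // ggml_pad_ext generalizes ggml_pad with separate left/right amounts:
        // here, one extra plane on the left of dim 0 only
        struct ggml_tensor * yp = ggml_pad_ext(ctx, y, 1, 0, 0, 0, 0, 0, 0, 0);
        (void) yp;

        ggml_free(ctx);
        return 0;
    }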

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 5 additions & 0 deletions
@@ -2660,6 +2660,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_IM2COL_3D:
+            {
+                ggml_compute_forward_im2col_3d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D:
             {
                 ggml_compute_forward_conv_2d(params, tensor);
@@ -3080,6 +3084,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_IM2COL_3D:
         case GGML_OP_CONV_2D:
         case GGML_OP_CONV_3D:
         case GGML_OP_CONV_2D_DW:
