Skip to content

Commit a29364d

Browse files
committed
Merge remote-tracking branch 'origin/master' into gabe-l-hart/alora-support
* origin/master: Thinking model disabled assistant prefill (ggml-org#15404) Implement --log-colors with always/never/auto (ggml-org#15792) CUDA: fastdiv, launch bounds for mmvq + q8_1 quant (ggml-org#15802) tests : add --list-ops and --show-coverage options (ggml-org#15745) gguf: gguf_writer refactor (ggml-org#15691) kv-cache : fix SWA checks + disable cacheless iSWA (ggml-org#15811) model-conversion : add --embeddings flag to modelcard.template [no ci] (ggml-org#15801) chat : fixed crash when Hermes 2 <tool_call> had a newline before it (ggml-org#15639) chat : nemotron thinking & toolcalling support (ggml-org#15676) scripts : add Jinja tester PySide6 simple app (ggml-org#15756) llama : add support for EmbeddingGemma 300m (ggml-org#15798)
2 parents 5958557 + 5fac79c commit a29364d

30 files changed

+1581
-179
lines changed

common/arg.cpp

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1263,6 +1263,18 @@ static std::string list_builtin_chat_templates() {
12631263
return msg.str();
12641264
}
12651265

1266+
// Returns true when the string is one of the accepted "enabled" spellings
// used by tri-state CLI flags ("1", "on", "enabled"). Matching is exact and
// case-sensitive.
static bool is_truthy(const std::string & value) {
    return value == "1" || value == "on" || value == "enabled";
}
1269+
1270+
// Returns true when the string is one of the accepted "disabled" spellings
// used by tri-state CLI flags ("0", "off", "disabled"). Matching is exact and
// case-sensitive.
static bool is_falsey(const std::string & value) {
    return value == "0" || value == "off" || value == "disabled";
}
1273+
1274+
// Returns true when the string selects automatic behavior for a tri-state
// CLI flag ("auto" or the legacy numeric spelling "-1").
static bool is_autoy(const std::string & value) {
    return value == "-1" || value == "auto";
}
1277+
12661278
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
12671279
// load dynamic backends
12681280
ggml_backend_load_all();
@@ -1544,21 +1556,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15441556
params.n_chunks = value;
15451557
}
15461558
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
1547-
add_opt(common_arg(
1548-
{"-fa", "--flash-attn"}, "FA",
1549-
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
1550-
[](common_params & params, const std::string & value) {
1551-
if (value == "on" || value == "enabled" || value == "1") {
1552-
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1553-
} else if (value == "off" || value == "disabled" || value == "0") {
1554-
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1555-
} else if (value == "auto" || value == "-1") {
1556-
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
1557-
} else {
1558-
throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
1559-
}
1560-
}
1561-
).set_env("LLAMA_ARG_FLASH_ATTN"));
1559+
// -fa/--flash-attn: tri-state flag mapped onto llama_flash_attn_type.
// Accepts the truthy/falsey/auto spellings recognized by is_truthy/is_falsey/is_autoy.
add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
                   string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
                                 llama_flash_attn_type_name(params.flash_attn_type)),
                   [](common_params & params, const std::string & value) {
                       if (is_truthy(value)) {
                           params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
                       } else if (is_falsey(value)) {
                           params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
                       } else if (is_autoy(value)) {
                           params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
                       } else {
                           // reject anything else; typo fixed: "unkown" -> "unknown"
                           throw std::runtime_error(
                               string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
                       }
                   }).set_env("LLAMA_ARG_FLASH_ATTN"));
15621574
add_opt(common_arg(
15631575
{"-p", "--prompt"}, "PROMPT",
15641576
"prompt to start generation with; for system message, use -sys",
@@ -3134,13 +3146,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
31343146
common_log_set_file(common_log_main(), value.c_str());
31353147
}
31363148
));
3137-
add_opt(common_arg(
3138-
{"--log-colors"},
3139-
"Enable colored logging",
3140-
[](common_params &) {
3141-
common_log_set_colors(common_log_main(), true);
3142-
}
3143-
).set_env("LLAMA_LOG_COLORS"));
3149+
// --log-colors: tri-state flag mapped onto the log_colors enum.
// 'auto' defers to terminal/environment detection at apply time.
add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
                   "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
                   "'auto' enables colors when output is to a terminal",
                   [](common_params &, const std::string & value) {
                       if (is_truthy(value)) {
                           common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
                       } else if (is_falsey(value)) {
                           common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
                       } else if (is_autoy(value)) {
                           common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
                       } else {
                           // reject anything else; typo fixed: "unkown" -> "unknown"
                           throw std::invalid_argument(
                               string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
                       }
                   }).set_env("LLAMA_LOG_COLORS"));
31443164
add_opt(common_arg(
31453165
{"-v", "--verbose", "--log-verbose"},
31463166
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",

common/chat.cpp

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
163163
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
164164
}
165165

166+
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
167+
common_chat_templates_inputs dummy_inputs;
168+
common_chat_msg msg;
169+
msg.role = "user";
170+
msg.content = "test";
171+
dummy_inputs.messages = {msg};
172+
dummy_inputs.enable_thinking = false;
173+
const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
174+
dummy_inputs.enable_thinking = true;
175+
const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
176+
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
177+
}
178+
166179
template <>
167180
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
168181
std::vector<common_chat_msg> msgs;
@@ -623,6 +636,7 @@ const char * common_chat_format_name(common_chat_format format) {
623636
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
624637
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
625638
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
639+
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
626640
default:
627641
throw std::runtime_error("Unknown chat format");
628642
}
@@ -1184,6 +1198,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
11841198
});
11851199
return data;
11861200
}
1201+
1202+
// Build chat params for the Nemotron v2 template family.
//
// Renders the prompt via the Jinja template, records the Nemotron V2 format,
// tracks whether the template left a <think> block open (so the parser and
// grammar know reasoning is in progress), and — when tools are supplied —
// constrains generation to a <TOOLCALL>[ ... ]</TOOLCALL> JSON array through a
// lazily-triggered grammar.
static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

    // Generate the prompt using the apply() function with the template
    data.prompt = apply(tmpl, inputs);
    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;

    // Handle thinking tags appropriately based on inputs.enable_thinking
    if (string_ends_with(data.prompt, "<think>\n")) {
        if (!inputs.enable_thinking) {
            // thinking disabled: close the tag immediately so the model skips reasoning
            data.prompt += "</think>";
        } else {
            data.thinking_forced_open = true;
        }
    }

    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
        data.grammar_lazy = true;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            // one JSON schema per declared function: {"name": <const>, "arguments": <params>}
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                schemas.push_back({
                    { "type", "object" },
                    { "properties",
                      {
                          { "name",
                            {
                                { "type", "string" },
                                { "const", function.at("name") },
                            } },
                          { "arguments", function.at("parameters") },
                      } },
                    { "required", json::array({ "name", "arguments" }) },
                });
            });
            // the payload is a non-empty array of any of the per-function schemas
            auto schema = json{
                { "type", "array" },
                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
                { "minItems", 1 },
            };
            if (!inputs.parallel_tool_calls) {
                // single tool call only when parallel calls are disabled
                schema["maxItems"] = 1;
            }
            builder.add_rule("root",
                             std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
                                 "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
                                 " \"</TOOLCALL>\"");
        });
        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
                                          // If thinking_forced_open, then we capture the </think> tag in the grammar,
                                          // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
                                          std::string(data.thinking_forced_open ?
                                                          "[\\s\\S]*?(</think>\\s*)" :
                                                          "(?:<think>[\\s\\S]*?</think>\\s*)?") +
                                              "(<TOOLCALL>)[\\s\\S]*" });
    }
    return data;
}
11871262
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
11881263
if (!builder.syntax().parse_tool_calls) {
11891264
builder.add_content(builder.consume_rest());
@@ -1830,7 +1905,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
18301905
// If thinking_forced_open, then we capture the </think> tag in the grammar,
18311906
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
18321907
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
1833-
"(\\s*"
1908+
"\\s*("
18341909
"(?:<tool_call>"
18351910
"|<function"
18361911
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
@@ -2060,6 +2135,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
20602135
}
20612136
}
20622137

2138+
// Parse a Nemotron v2 completion: optional <think>...</think> reasoning,
// followed by either plain content or a <TOOLCALL>[ ... ]</TOOLCALL> JSON
// array of tool calls. Throws common_chat_msg_partial_exception while the
// tool-call section is still incomplete (streaming).
static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<think>", "</think>");

    // without tool-call parsing, everything remaining is plain content
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    static const common_regex toolcall_open_regex(regex_escape("<TOOLCALL>"));
    if (auto opening = builder.try_find_regex(toolcall_open_regex)) {
        builder.move_to(opening->groups[0].end);

        // the payload must be a JSON array, terminated by the closing tag
        auto payload = builder.consume_json();
        if (!payload.json.is_array()) {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
        if (!builder.try_consume_literal("</TOOLCALL>")) {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
        builder.add_tool_calls(payload.json);
    }
    builder.add_content(builder.consume_rest());
}
2164+
20632165
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
20642166
// Parse thinking tags first - this handles the main reasoning content
20652167
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2293,6 +2395,11 @@ static common_chat_params common_chat_templates_apply_jinja(
22932395
return common_chat_params_init_seed_oss(tmpl, params, inputs);
22942396
}
22952397

2398+
// Nemotron v2
2399+
if (src.find("<SPECIAL_10>") != std::string::npos) {
2400+
return common_chat_params_init_nemotron_v2(tmpl, params);
2401+
}
2402+
22962403
// Use generic handler when mixing tools + JSON schema.
22972404
// TODO: support that mix in handlers below.
22982405
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2454,6 +2561,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
24542561
case COMMON_CHAT_FORMAT_SEED_OSS:
24552562
common_chat_parse_seed_oss(builder);
24562563
break;
2564+
case COMMON_CHAT_FORMAT_NEMOTRON_V2:
2565+
common_chat_parse_nemotron_v2(builder);
2566+
break;
24572567
default:
24582568
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
24592569
}

common/chat.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ enum common_chat_format {
112112
COMMON_CHAT_FORMAT_GRANITE,
113113
COMMON_CHAT_FORMAT_GPT_OSS,
114114
COMMON_CHAT_FORMAT_SEED_OSS,
115+
COMMON_CHAT_FORMAT_NEMOTRON_V2,
115116

116117
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
117118
};
@@ -198,6 +199,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
198199

199200
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
200201

202+
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
203+
201204
// Parses a JSON array of messages in OpenAI's chat completion API format.
202205
// T can be std::string containing JSON or nlohmann::ordered_json
203206
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);

common/log.cpp

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,52 @@
44
#include <condition_variable>
55
#include <cstdarg>
66
#include <cstdio>
7+
#include <cstdlib>
8+
#include <cstring>
79
#include <mutex>
810
#include <sstream>
911
#include <thread>
1012
#include <vector>
1113

14+
#if defined(_WIN32)
15+
# include <io.h>
16+
# include <windows.h>
17+
# define isatty _isatty
18+
# define fileno _fileno
19+
#else
20+
# include <unistd.h>
21+
#endif // defined(_WIN32)
22+
1223
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
1324

1425
void common_log_set_verbosity_thold(int verbosity) {
1526
common_log_verbosity_thold = verbosity;
1627
}
1728

29+
// Decide whether colored log output should be enabled when the user asked for
// 'auto': honor the NO_COLOR convention (https://no-color.org/), refuse on
// TERM=dumb, and otherwise enable colors iff stdout or stderr is a terminal.
static bool common_log_should_use_colors_auto() {
    // any non-empty NO_COLOR disables colors, per https://no-color.org/
    const char * no_color = std::getenv("NO_COLOR");
    if (no_color != nullptr && no_color[0] != '\0') {
        return false;
    }

    // a "dumb" terminal cannot render escape sequences
    const char * term = std::getenv("TERM");
    if (term != nullptr && std::strcmp(term, "dumb") == 0) {
        return false;
    }

    // log messages can go to either stream, so colors are enabled when at
    // least one of stdout/stderr is connected to a terminal
    return isatty(fileno(stdout)) || isatty(fileno(stderr));
}
52+
1853
static int64_t t_us() {
1954
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
2055
}
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
353388

354389
// Returns the process-wide logger singleton. On first call, color output is
// initialized from terminal/environment auto-detection; std::call_once makes
// that one-time setup thread-safe even when multiple threads race here.
struct common_log * common_log_main() {
    static struct common_log log;
    static std::once_flag init_flag;
    std::call_once(init_flag, [&]() {
        // Set default to auto-detect colors
        log.set_colors(common_log_should_use_colors_auto());
    });

    return &log;
}
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
380420
log->set_file(file);
381421
}
382422

383-
void common_log_set_colors(struct common_log * log, bool colors) {
384-
log->set_colors(colors);
423+
void common_log_set_colors(struct common_log * log, log_colors colors) {
424+
if (colors == LOG_COLORS_AUTO) {
425+
log->set_colors(common_log_should_use_colors_auto());
426+
return;
427+
}
428+
429+
if (colors == LOG_COLORS_DISABLED) {
430+
log->set_colors(false);
431+
return;
432+
}
433+
434+
GGML_ASSERT(colors == LOG_COLORS_ENABLED);
435+
log->set_colors(true);
385436
}
386437

387438
void common_log_set_prefix(struct common_log * log, bool prefix) {

common/log.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@
2424
#define LOG_DEFAULT_DEBUG 1
2525
#define LOG_DEFAULT_LLAMA 0
2626

27+
// Tri-state color setting for the logger; AUTO defers the decision to
// terminal/environment detection at the time the setting is applied.
enum log_colors {
    LOG_COLORS_AUTO = -1,     // detect from NO_COLOR/TERM and whether output is a TTY
    LOG_COLORS_DISABLED = 0,  // never emit ANSI color codes
    LOG_COLORS_ENABLED = 1,   // always emit ANSI color codes
};
32+
2733
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
2834
// set via common_log_set_verbosity()
2935
extern int common_log_verbosity_thold;
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
6571
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
6672
//
6773

68-
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
69-
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
70-
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
71-
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
74+
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
75+
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
76+
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
77+
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
7278

7379
// helper macros for logging
7480
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold

convert_hf_to_gguf.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5122,6 +5122,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
51225122
return [(self.map_tensor_name(name), data_torch)]
51235123

51245124

5125+
@ModelBase.register("Gemma3TextModel")
class EmbeddingGemma(Gemma3Model):
    # Conversion handler for EmbeddingGemma checkpoints (HF architecture
    # "Gemma3TextModel"). Reuses the Gemma3 text-model conversion but targets
    # the dedicated GEMMA_EMBEDDING arch so the GGUF is treated as an
    # embedding model.
    model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING

    def set_gguf_parameters(self):
        # Export the standard Gemma3 hyperparameters, then additionally record
        # the pooling type required for embedding inference.
        super().set_gguf_parameters()
        self._try_set_pooling_type()
5132+
5133+
51255134
@ModelBase.register("Gemma3ForConditionalGeneration")
51265135
class Gemma3VisionModel(MmprojModel):
51275136
def set_gguf_parameters(self):

0 commit comments

Comments
 (0)