Skip to content

Commit 5de51b7

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/close-issue.yml # docs/build-s390x.md # examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp # ggml/CMakeLists.txt # ggml/src/ggml-cann/ggml-cann.cpp # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-cpu/kleidiai/kleidiai.cpp # ggml/src/ggml-cuda/fattn-tile-f16.cu # ggml/src/ggml-cuda/fattn.cu # ggml/src/ggml-webgpu/ggml-webgpu.cpp # scripts/tool_bench.py # tests/test-backend-ops.cpp # tools/batched-bench/batched-bench.cpp # tools/server/README.md
2 parents 73be216 + 233d773 commit 5de51b7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+1991
-1301
lines changed

common/arg.cpp

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,6 +1265,18 @@ static std::string list_builtin_chat_templates() {
12651265
return msg.str();
12661266
}
12671267

1268+
static bool is_truthy(const std::string & value) {
1269+
return value == "on" || value == "enabled" || value == "1";
1270+
}
1271+
1272+
static bool is_falsey(const std::string & value) {
1273+
return value == "off" || value == "disabled" || value == "0";
1274+
}
1275+
1276+
static bool is_autoy(const std::string & value) {
1277+
return value == "auto" || value == "-1";
1278+
}
1279+
12681280
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
12691281
// load dynamic backends
12701282
ggml_backend_load_all();
@@ -1546,21 +1558,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
15461558
params.n_chunks = value;
15471559
}
15481560
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
1549-
add_opt(common_arg(
1550-
{"-fa", "--flash-attn"}, "FA",
1551-
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
1552-
[](common_params & params, const std::string & value) {
1553-
if (value == "on" || value == "enabled" || value == "1") {
1554-
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1555-
} else if (value == "off" || value == "disabled" || value == "0") {
1556-
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1557-
} else if (value == "auto" || value == "-1") {
1558-
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
1559-
} else {
1560-
throw std::runtime_error(string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
1561-
}
1562-
}
1563-
).set_env("LLAMA_ARG_FLASH_ATTN"));
1561+
add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
1562+
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
1563+
llama_flash_attn_type_name(params.flash_attn_type)),
1564+
[](common_params & params, const std::string & value) {
1565+
if (is_truthy(value)) {
1566+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1567+
} else if (is_falsey(value)) {
1568+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1569+
} else if (is_autoy(value)) {
1570+
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
1571+
} else {
1572+
throw std::runtime_error(
1573+
string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
1574+
}
1575+
}).set_env("LLAMA_ARG_FLASH_ATTN"));
15641576
add_opt(common_arg(
15651577
{"-p", "--prompt"}, "PROMPT",
15661578
"prompt to start generation with; for system message, use -sys",
@@ -3136,13 +3148,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
31363148
common_log_set_file(common_log_main(), value.c_str());
31373149
}
31383150
));
3139-
add_opt(common_arg(
3140-
{"--log-colors"},
3141-
"Enable colored logging",
3142-
[](common_params &) {
3143-
common_log_set_colors(common_log_main(), true);
3144-
}
3145-
).set_env("LLAMA_LOG_COLORS"));
3151+
add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
3152+
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
3153+
"'auto' enables colors when output is to a terminal",
3154+
[](common_params &, const std::string & value) {
3155+
if (is_truthy(value)) {
3156+
common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
3157+
} else if (is_falsey(value)) {
3158+
common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
3159+
} else if (is_autoy(value)) {
3160+
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
3161+
} else {
3162+
throw std::invalid_argument(
3163+
string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
3164+
}
3165+
}).set_env("LLAMA_LOG_COLORS"));
31463166
add_opt(common_arg(
31473167
{"-v", "--verbose", "--log-verbose"},
31483168
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",

common/chat.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
163163
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
164164
}
165165

166+
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
167+
common_chat_templates_inputs dummy_inputs;
168+
common_chat_msg msg;
169+
msg.role = "user";
170+
msg.content = "test";
171+
dummy_inputs.messages = {msg};
172+
dummy_inputs.enable_thinking = false;
173+
const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
174+
dummy_inputs.enable_thinking = true;
175+
const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
176+
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
177+
}
178+
166179
template <>
167180
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
168181
std::vector<common_chat_msg> msgs;

common/chat.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
199199

200200
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
201201

202+
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
203+
202204
// Parses a JSON array of messages in OpenAI's chat completion API format.
203205
// T can be std::string containing JSON or nlohmann::ordered_json
204206
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);

common/log.cpp

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,52 @@
44
#include <condition_variable>
55
#include <cstdarg>
66
#include <cstdio>
7+
#include <cstdlib>
8+
#include <cstring>
79
#include <mutex>
810
#include <sstream>
911
#include <thread>
1012
#include <vector>
1113

14+
#if defined(_WIN32)
15+
# include <io.h>
16+
# include <windows.h>
17+
# define isatty _isatty
18+
# define fileno _fileno
19+
#else
20+
# include <unistd.h>
21+
#endif // defined(_WIN32)
22+
1223
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
1324

1425
void common_log_set_verbosity_thold(int verbosity) {
1526
common_log_verbosity_thold = verbosity;
1627
}
1728

29+
// Auto-detect if colors should be enabled based on terminal and environment
30+
static bool common_log_should_use_colors_auto() {
31+
// Check NO_COLOR environment variable (https://no-color.org/)
32+
if (const char * no_color = std::getenv("NO_COLOR")) {
33+
if (no_color[0] != '\0') {
34+
return false;
35+
}
36+
}
37+
38+
// Check TERM environment variable
39+
if (const char * term = std::getenv("TERM")) {
40+
if (std::strcmp(term, "dumb") == 0) {
41+
return false;
42+
}
43+
}
44+
45+
// Check if stdout and stderr are connected to a terminal
46+
// We check both because log messages can go to either
47+
bool stdout_is_tty = isatty(fileno(stdout));
48+
bool stderr_is_tty = isatty(fileno(stderr));
49+
50+
return stdout_is_tty || stderr_is_tty;
51+
}
52+
1853
static int64_t t_us() {
1954
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
2055
}
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
353388

354389
struct common_log * common_log_main() {
355390
static struct common_log log;
391+
static std::once_flag init_flag;
392+
std::call_once(init_flag, [&]() {
393+
// Set default to auto-detect colors
394+
log.set_colors(common_log_should_use_colors_auto());
395+
});
356396

357397
return &log;
358398
}
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
380420
log->set_file(file);
381421
}
382422

383-
void common_log_set_colors(struct common_log * log, bool colors) {
384-
log->set_colors(colors);
423+
void common_log_set_colors(struct common_log * log, log_colors colors) {
424+
if (colors == LOG_COLORS_AUTO) {
425+
log->set_colors(common_log_should_use_colors_auto());
426+
return;
427+
}
428+
429+
if (colors == LOG_COLORS_DISABLED) {
430+
log->set_colors(false);
431+
return;
432+
}
433+
434+
GGML_ASSERT(colors == LOG_COLORS_ENABLED);
435+
log->set_colors(true);
385436
}
386437

387438
void common_log_set_prefix(struct common_log * log, bool prefix) {

common/log.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@
2424
#define LOG_DEFAULT_DEBUG 1
2525
#define LOG_DEFAULT_LLAMA 0
2626

27+
enum log_colors {
28+
LOG_COLORS_AUTO = -1,
29+
LOG_COLORS_DISABLED = 0,
30+
LOG_COLORS_ENABLED = 1,
31+
};
32+
2733
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
2834
// set via common_log_set_verbosity()
2935
extern int common_log_verbosity_thold;
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
6571
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
6672
//
6773

68-
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
69-
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
70-
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
71-
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
74+
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
75+
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
76+
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
77+
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
7278

7379
// helper macros for logging
7480
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold

convert_hf_to_gguf.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5128,6 +5128,20 @@ class EmbeddingGemma(Gemma3Model):
51285128

51295129
def set_gguf_parameters(self):
51305130
super().set_gguf_parameters()
5131+
5132+
# Override the sliding window size as it gets adjusted by the Gemma3TextConfig
5133+
# constructor. We want to use the value from the original model's config.json.
5134+
# ref: https://github.com/huggingface/transformers/pull/40700
5135+
with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
5136+
config = json.load(f)
5137+
orig_sliding_window = config.get("sliding_window")
5138+
if orig_sliding_window is None:
5139+
raise ValueError("sliding_window not found in model config - this is required for the model")
5140+
5141+
logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
5142+
f"instead of {self.hparams['sliding_window']}")
5143+
self.gguf_writer.add_sliding_window(orig_sliding_window)
5144+
51315145
self._try_set_pooling_type()
51325146

51335147

convert_lora_to_gguf.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from math import prod
1313
from pathlib import Path
1414
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
15-
from transformers import AutoConfig
15+
from transformers import AutoConfig, AutoTokenizer
1616

1717
import torch
1818

@@ -26,6 +26,8 @@
2626
# reuse model definitions from convert_hf_to_gguf.py
2727
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
2828

29+
from gguf.constants import GGUFValueType
30+
2931
logger = logging.getLogger("lora-to-gguf")
3032

3133

@@ -369,7 +371,31 @@ def set_type(self):
369371
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
370372

371373
def set_gguf_parameters(self):
374+
logger.debug("GGUF KV: %s = %d", gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
372375
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
376+
alora_invocation_tokens = lparams.get("alora_invocation_tokens")
377+
invocation_string = lparams.get("invocation_string")
378+
if invocation_string and not alora_invocation_tokens:
379+
logger.debug("Tokenizing invocation_string -> alora_invocation_tokens")
380+
base_model_path_or_id = hparams.get("_name_or_path")
381+
try:
382+
tokenizer = AutoTokenizer.from_pretrained(base_model_path_or_id)
383+
except ValueError:
384+
logger.error("Unable to load tokenizer from %s", base_model_path_or_id)
385+
raise
386+
# NOTE: There's an off-by-one with the older aLoRAs where
387+
# the invocation string includes the "<|start_of_turn|>"
388+
# token, but the adapters themselves were trained to
389+
# activate _after_ that first token, so we drop it here.
390+
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
391+
if alora_invocation_tokens:
392+
logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
393+
self.gguf_writer.add_key_value(
394+
gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS,
395+
alora_invocation_tokens,
396+
GGUFValueType.ARRAY,
397+
GGUFValueType.UINT32,
398+
)
373399

374400
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
375401
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters

ggml/include/ggml-cpu.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,6 @@ extern "C" {
101101
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102102
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103103
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104-
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
105104
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
106105
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
107106

0 commit comments

Comments
 (0)