Commit 78f181e

Merge pull request #201 from menloresearch/update-dev-from-master-2025-08-12-00-12
Sync master with upstream release b6133
2 parents: fb8fbcd + cf9e564

21 files changed, +545 −277 lines

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -240,7 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Infrastructure</summary>
 
-- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
```

common/arg.cpp

Lines changed: 1 addition & 5 deletions
```diff
@@ -2949,11 +2949,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
-            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
-            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
-            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
-            else { throw std::invalid_argument("invalid value"); }
+            params.reasoning_format = common_reasoning_format_from_name(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
```
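The refactor replaces the inline if/else chain with a single call to the new `common_reasoning_format_from_name` helper (added in common/chat.cpp below and declared in common/chat.h), so the CLI parser and any other caller share one name-to-enum mapping. A minimal, self-contained sketch of that pattern — the enum values mirror the diff, the `main` harness is ours for illustration:

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

// Subset of the enum referenced in the diff, for illustration.
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,
    COMMON_REASONING_FORMAT_DEEPSEEK,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY,
};

// Same shape as the helper introduced in common/chat.cpp: map a CLI string
// to the enum, throwing on unknown input so the argument parser can report it.
common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
    if (format == "none")            { return COMMON_REASONING_FORMAT_NONE; }
    if (format == "auto")            { return COMMON_REASONING_FORMAT_AUTO; }
    if (format == "deepseek")        { return COMMON_REASONING_FORMAT_DEEPSEEK; }
    if (format == "deepseek-legacy") { return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
    throw std::runtime_error("Unknown reasoning format: " + format);
}

int main() {
    try {
        common_reasoning_format fmt = common_reasoning_format_from_name("deepseek");
        std::cout << "parsed ok: " << fmt << "\n";   // prints the enum's integer value
        common_reasoning_format_from_name("bogus");  // throws
    } catch (const std::runtime_error & e) {
        std::cerr << e.what() << "\n";               // "Unknown reasoning format: bogus"
    }
    return 0;
}
```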

common/chat.cpp

Lines changed: 24 additions & 0 deletions
```diff
@@ -552,6 +552,17 @@ common_chat_templates_ptr common_chat_templates_init(
             default_template_src = CHATML_TEMPLATE_SRC;
         }
     }
+
+    // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
+    if (default_template_src.find("<|channel|>") != std::string::npos
+        // search for the error message and patch it
+        && default_template_src.find("in message.content or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
+            "{%- if false %}");
+    }
+
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
     bool add_bos = false;
@@ -625,6 +636,19 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
     }
 }
 
+common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
+    if (format == "none") {
+        return COMMON_REASONING_FORMAT_NONE;
+    } else if (format == "auto") {
+        return COMMON_REASONING_FORMAT_AUTO;
+    } else if (format == "deepseek") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK;
+    } else if (format == "deepseek-legacy") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+    }
+    throw std::runtime_error("Unknown reasoning format: " + format);
+}
+
 static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
     std::string arguments;
     if (builder.is_partial()) {
```
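The first hunk patches GPT-OSS-style templates in place: if the template source contains the `<|channel|>` marker and the problematic membership test, the offending Jinja condition is rewritten to `{%- if false %}` before the template is ever compiled. `string_replace_all` is llama.cpp's own helper; the sketch below reproduces the same substitution using only the standard library (the `replace_all` function is ours, not llama.cpp's):

```cpp
#include <iostream>
#include <string>

// Stand-in for llama.cpp's string_replace_all, so the demo is self-contained.
static void replace_all(std::string & s, const std::string & search, const std::string & repl) {
    for (size_t pos = 0; (pos = s.find(search, pos)) != std::string::npos; pos += repl.size()) {
        s.replace(pos, search.size(), repl);
    }
}

int main() {
    // A trimmed-down template carrying the construct the hack targets.
    std::string tmpl =
        "{%- if \"<|channel|>analysis<|message|>\" in message.content or "
        "\"<|channel|>final<|message|>\" in message.content %}...{%- endif %}";

    // Same two-step check as the diff: a cheap marker test first, then the
    // exact error-prone condition, which is patched to a branch that never fires.
    if (tmpl.find("<|channel|>") != std::string::npos &&
        tmpl.find("in message.content or") != std::string::npos) {
        replace_all(tmpl,
            "{%- if \"<|channel|>analysis<|message|>\" in message.content or "
            "\"<|channel|>final<|message|>\" in message.content %}",
            "{%- if false %}");
    }

    std::cout << tmpl << "\n"; // "{%- if false %}...{%- endif %}"
    return 0;
}
```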

common/chat.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -191,6 +191,7 @@ std::string common_chat_format_example(
 
 const char* common_chat_format_name(common_chat_format format);
 const char* common_reasoning_format_name(common_reasoning_format format);
+common_reasoning_format common_reasoning_format_from_name(const std::string & format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
```

convert_hf_to_gguf.py

Lines changed: 255 additions & 99 deletions
Large diffs are not rendered by default.

convert_lora_to_gguf.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -340,7 +340,7 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
         sys.exit(1)
     else:
         logger.info(f"Loading base model: {dir_base_model.name}")
-        hparams = ModelBase.load_hparams(dir_base_model)
+        hparams = ModelBase.load_hparams(dir_base_model, False)
 
     with torch.inference_mode():
         try:
```

docs/multimodal/minicpmo2.6.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta
 
 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
````

docs/multimodal/minicpmv2.6.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta
 
 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
````

ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

Lines changed: 16 additions & 7 deletions
```diff
@@ -259,7 +259,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         const int64_t m_start = 0;
 
         const int64_t n_step = static_cast<int64_t>(kernel->get_n_step());
-        const int64_t num_threads = KAI_MIN(n / n_step, nth);
+        int64_t num_threads = KAI_MIN(n / n_step, nth);
+        if (num_threads <= 0) {
+            num_threads = 1;
+        }
 
         if (ith < num_threads) {
             const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step);
@@ -309,7 +312,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         GGML_ASSERT(kernel);
 
         const int ith = params->ith;
-        const int nth = params->nth;
+        const int nth_raw = params->nth;
+        const int nth = nth_raw > 0 ? nth_raw : 1;
 
         const size_t k = ne00;
         const size_t m = ne11;
@@ -327,9 +331,12 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
         const size_t n_start = ith * num_n_per_thread;
 
-        size_t n_to_process = num_n_per_thread;
-        if ((n_start + n_to_process) > n) {
-            n_to_process = n - n_start;
+        size_t n_to_process = 0;
+        if (n_start < n) {
+            n_to_process = num_n_per_thread;
+            if ((n_start + n_to_process) > n) {
+                n_to_process = n - n_start;
+            }
         }
 
         // Calculate number of columns to be processed per thread
@@ -361,8 +368,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
         const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset);
         float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
 
-        variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
-            sizeof(float), -FLT_MAX, FLT_MAX);
+        if (n_to_process > 0) {
+            variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
+                sizeof(float), -FLT_MAX, FLT_MAX);
+        }
 
         return true;
     }
```
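These hunks all defend against degenerate work splits: a thread count of zero, or a per-thread quota rounded up so far that a thread's start column lands past `n`, where the old unsigned arithmetic `n - n_start` would underflow and hand the kernel an enormous column count. A minimal sketch of the guarded partitioning logic under those assumptions — `roundup` mirrors `kai_roundup` from the diff, the harness is ours:

```cpp
#include <cstddef>
#include <cstdio>

// Round x up to the next multiple of step (mirrors kai_roundup in the diff).
static size_t roundup(size_t x, size_t step) {
    return ((x + step - 1) / step) * step;
}

// Column range for thread `ith` of `nth_raw`, guarding the degenerate cases
// the diff fixes: nth == 0 and a start index past `n`.
static size_t columns_for_thread(size_t n, size_t n_step, int ith, int nth_raw,
                                 size_t * n_start_out) {
    const int nth = nth_raw > 0 ? nth_raw : 1; // clamp, as in the second hunk

    const size_t num_n_per_thread = roundup(roundup(n, nth) / nth, n_step);
    const size_t n_start = ith * num_n_per_thread;
    *n_start_out = n_start;

    // Without this guard, n - n_start underflows size_t when n_start > n.
    size_t n_to_process = 0;
    if (n_start < n) {
        n_to_process = num_n_per_thread;
        if (n_start + n_to_process > n) {
            n_to_process = n - n_start;
        }
    }
    return n_to_process;
}

int main() {
    size_t start = 0;
    // 10 columns, step 8, 4 threads: the per-thread quota rounds up to 8, so
    // thread 0 gets 8 columns, thread 1 gets 2, and threads 2-3 start past n
    // and must receive zero work rather than an underflowed count.
    for (int ith = 0; ith < 4; ith++) {
        size_t todo = columns_for_thread(10, 8, ith, 4, &start);
        printf("thread %d: start=%zu todo=%zu\n", ith, start, todo);
        if (todo > 0) { /* only then invoke the matmul micro-kernel */ }
    }
    return 0;
}
```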

gguf-py/gguf/tensor_mapping.py

Lines changed: 25 additions & 12 deletions
```diff
@@ -1119,7 +1119,8 @@ class TensorNameMap:
             "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
-            "vision_tower.patch_conv", # pixtral
+            "vision_tower.patch_conv", # pixtral-hf
+            "vision_encoder.patch_conv", # pixtral
             "vision_model.patch_embedding.linear", # llama 4
             "visual.patch_embed.proj", # qwen2vl
         ),
@@ -1138,7 +1139,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
-            "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
             "visual.blocks.{bid}.attn.q", # qwen2vl, generated
         ),
@@ -1153,7 +1155,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
-            "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
             "visual.blocks.{bid}.attn.k", # qwen2vl, generated
         ),
@@ -1168,7 +1171,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
-            "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
             "visual.blocks.{bid}.attn.v", # qwen2vl, generated
         ),
@@ -1178,7 +1182,8 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
-            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
             "vision_model.model.layers.{bid}.input_layernorm", # llama4
             "visual.blocks.{bid}.norm1", # qwen2vl
         ),
@@ -1190,7 +1195,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
-            "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
             "visual.blocks.{bid}.attn.proj", # qwen2vl
         ),
@@ -1201,7 +1207,8 @@ class TensorNameMap:
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
             "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
-            "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
+            "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
             "visual.blocks.{bid}.norm2", # qwen2vl
         ),
@@ -1210,14 +1217,16 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
-            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc1", # llama4
             "visual.blocks.{bid}.mlp.fc1", # qwen2vl
             "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
         ),
 
         MODEL_TENSOR.V_ENC_FFN_GATE: (
-            "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
             "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
         ),
@@ -1226,7 +1235,8 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
-            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
+            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
+            "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc2", # llama4
             "visual.blocks.{bid}.mlp.fc2", # qwen2vl
             "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
@@ -1244,7 +1254,8 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_PRE_NORM: (
             "vision_tower.vision_model.pre_layrnorm",
-            "vision_tower.ln_pre", # pixtral
+            "vision_tower.ln_pre", # pixtral-hf
+            "vision_encoder.ln_pre", # pixtral
             "vision_model.layernorm_pre", # llama4
         ),
@@ -1261,6 +1272,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MM_INP_NORM: (
             "multi_modal_projector.norm",
+            "pre_mm_projector_norm",
         ),
 
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
@@ -1316,7 +1328,8 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.V_MM_PATCH_MERGER: (
-            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
+            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
+            "patch_merger.merging_layer", # mistral
         ),
 
         # audio (mtmd)
```
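These entries register the native (non-HuggingFace) Pixtral/Mistral checkpoint names (`vision_encoder.*` with `wq`/`wk`/`wv`/`wo` and `w1`/`w2`/`w3`) as additional aliases for the same GGUF tensors, so both layouts convert without special-casing. The mechanism, sketched in C++ for consistency with the other examples here rather than in gguf-py's actual Python `TensorNameMap`; the canonical GGUF-side name and helper functions are illustrative:

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Each canonical tensor accepts several source-name patterns; "{bid}" is
// substituted with the block index, as in gguf-py's TensorNameMap.
static const std::map<std::string, std::vector<std::string>> tensor_aliases = {
    { "v.blk.{bid}.attn_q", {
        "vision_tower.transformer.layers.{bid}.attention.q_proj", // pixtral-hf
        "vision_encoder.transformer.layers.{bid}.attention.wq",   // pixtral (native)
    } },
};

static std::string subst_bid(std::string s, int bid) {
    const std::string key = "{bid}";
    if (size_t pos = s.find(key); pos != std::string::npos) {
        s.replace(pos, key.size(), std::to_string(bid));
    }
    return s;
}

// Resolve a source tensor name to its canonical name, if any alias matches.
static bool map_tensor_name(const std::string & src, int bid, std::string & out) {
    for (const auto & [canonical, aliases] : tensor_aliases) {
        for (const auto & alias : aliases) {
            if (subst_bid(alias, bid) == src) {
                out = subst_bid(canonical, bid);
                return true;
            }
        }
    }
    return false;
}

int main() {
    std::string mapped;
    // Both the HF layout and the native layout resolve to the same tensor.
    for (const char * src : { "vision_tower.transformer.layers.3.attention.q_proj",
                              "vision_encoder.transformer.layers.3.attention.wq" }) {
        if (map_tensor_name(src, 3, mapped)) {
            std::cout << src << " -> " << mapped << "\n";
        }
    }
    return 0;
}
```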
