Commit 3e19546

Merge branch 'ggml-org:master' into master
2 parents d82f529 + 9ebebef commit 3e19546

29 files changed: +694 -587 lines

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -151,6 +151,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
 - [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
 - [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
+- [x] [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)
 
 </details>
```

common/arg.cpp

Lines changed: 4 additions & 2 deletions
```diff
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+        string_format("KV cache defragmentation threshold (DEPRECATED)"),
         [](common_params & params, const std::string & value) {
-            params.defrag_thold = std::stof(value);
+            GGML_UNUSED(params);
+            GGML_UNUSED(value);
+            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
```
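The new handler keeps `--defrag-thold` parseable so existing scripts don't break: the option still consumes its value, but the value is discarded and a deprecation warning is logged instead of setting a parameter. A minimal standalone sketch of that pattern (hypothetical option table and handler, not llama.cpp's actual `common_arg` machinery):

```cpp
#include <cstdio>
#include <functional>
#include <map>
#include <string>

int main(int argc, char ** argv) {
    // hypothetical option table; only the flag name comes from the diff above
    std::map<std::string, std::function<void(const std::string &)>> opts = {
        {"--defrag-thold", [](const std::string & /*value*/) {
            // deprecated: accept and discard the value so old invocations still run
            std::fprintf(stderr,
                "DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
        }},
    };

    for (int i = 1; i < argc; ++i) {
        auto it = opts.find(argv[i]);
        if (it != opts.end() && i + 1 < argc) {
            it->second(argv[++i]); // consume the value even though it is unused
        }
    }
    return 0;
}
```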

common/common.cpp

Lines changed: 0 additions & 1 deletion
```diff
@@ -1152,7 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
     cparams.pooling_type      = params.pooling_type;
     cparams.attention_type    = params.attention_type;
-    cparams.defrag_thold      = params.defrag_thold;
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
```

common/common.h

Lines changed: 0 additions & 1 deletion
```diff
@@ -288,7 +288,6 @@ struct common_params {
     float   yarn_beta_fast = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;     // YaRN original context length
-    float   defrag_thold   = 0.1f;  // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
```

docs/build-s390x.md

Lines changed: 4 additions & 3 deletions
```diff
@@ -265,8 +265,9 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 | BF16  | 🚫 | 🚫 | ❓ | ❓ |
 | Q4_0  | ✅ | ✅ | ❓ | ❓ |
 | Q4_1  | ✅ | ✅ | ❓ | ❓ |
-| Q5_0  | 🚫 | 🚫 | ❓ | ❓ |
-| Q5_1  | 🚫 | 🚫 | ❓ | ❓ |
+| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
+| Q5_0  | ✅ | ✅ | ❓ | ❓ |
+| Q5_1  | ✅ | ✅ | ❓ | ❓ |
 | Q8_0  | ✅ | ✅ | ❓ | ❓ |
 | Q2_K  | 🚫 | 🚫 | ❓ | ❓ |
 | Q3_K  | ✅ | ✅ | ❓ | ❓ |
@@ -291,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 - 🚫 - acceleration unavailable, will still run using scalar implementation
 - ❓ - acceleration unknown, please contribute if you can test it yourself
 
-Last Updated by **Aaron Teo ([email protected])** on July 31, 2025.
+Last Updated by **Aaron Teo ([email protected])** on Aug 22, 2025.
```

examples/llama.vim

Lines changed: 1 addition & 1 deletion
```diff
@@ -17,7 +17,7 @@
 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
 "
 "   --batch-size [512, model max context]
 "
```

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 0 additions & 2 deletions
```diff
@@ -150,8 +150,6 @@
 #elif defined(__s390x__)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
-#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
-#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
```
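For context, `arch-fallback.h` aliases the generic scalar kernels to the public symbol names on architectures without hand-written SIMD implementations; deleting the two `#define`s means s390x now provides native Q5_0/Q5_1 dot products (matching the ✅ entries added in `docs/build-s390x.md`). A minimal sketch of the mechanism, using hypothetical names (`vec_dot_sketch`, `HAVE_NATIVE_VEC_DOT`) and a simplified signature rather than the real ggml symbols:

```cpp
#include <cstdio>

#if defined(HAVE_NATIVE_VEC_DOT) // hypothetical guard: a hand-written kernel exists
// an arch-specific file would define vec_dot_sketch directly
void vec_dot_sketch(int n, float * s, const float * x, const float * y);
#else
// no native kernel: rename the generic implementation to the public symbol,
// which is exactly the role the *_generic #defines play in arch-fallback.h
#define vec_dot_sketch_generic vec_dot_sketch
void vec_dot_sketch_generic(int n, float * s, const float * x, const float * y) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += x[i] * y[i]; // plain scalar loop, no SIMD
    }
    *s = sum;
}
#endif

int main() {
    const float x[4] = {1, 2, 3, 4};
    const float y[4] = {1, 1, 1, 1};
    float s = 0.0f;
    vec_dot_sketch(4, &s, x, y); // resolves to whichever definition was compiled in
    std::printf("%.1f\n", s);    // prints 10.0
    return 0;
}
```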
