
Commit cef9b30

Merge branch 'ggml-org:master' into seed_oss
2 parents: 7c2b3e0 + 9ebebef

31 files changed: +817, -625 lines

README.md (1 addition, 0 deletions)

@@ -151,6 +151,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
 - [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
 - [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
+- [x] [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)
 
 </details>

common/arg.cpp (4 additions, 2 deletions)

@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+        string_format("KV cache defragmentation threshold (DEPRECATED)"),
         [](common_params & params, const std::string & value) {
-            params.defrag_thold = std::stof(value);
+            GGML_UNUSED(params);
+            GGML_UNUSED(value);
+            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
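Note: the hunk above follows the usual pattern for retiring a command-line flag without breaking existing invocations: the option is still parsed, its value is discarded, and a warning is logged. A minimal, self-contained sketch of that pattern (plain C++; the flag handling below is illustrative only, not llama.cpp's argument parser):

#include <cstdio>
#include <string>
#include <vector>

// Sketch: accept a deprecated "--defrag-thold N" flag, swallow its value,
// warn, and otherwise ignore it so existing scripts keep working.
int main(int argc, char ** argv) {
    std::vector<std::string> args(argv + 1, argv + argc);
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i] == "-dt" || args[i] == "--defrag-thold") {
            if (i + 1 < args.size()) {
                i++; // consume the numeric argument
            }
            std::fprintf(stderr, "warning: --defrag-thold is deprecated and has no effect\n");
            continue;
        }
        // ... handle the remaining, still-supported options here ...
    }
    return 0;
}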

common/common.cpp (0 additions, 1 deletion)

@@ -1152,7 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
     cparams.pooling_type      = params.pooling_type;
     cparams.attention_type    = params.attention_type;
-    cparams.defrag_thold      = params.defrag_thold;
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;

common/common.h (0 additions, 1 deletion)

@@ -288,7 +288,6 @@ struct common_params {
     float   yarn_beta_fast = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;     // YaRN original context length
-    float   defrag_thold   = 0.1f;  // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

docs/build-s390x.md (4 additions, 3 deletions)

@@ -265,8 +265,9 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 | BF16 | 🚫 | 🚫 |||
 | Q4_0 |||||
 | Q4_1 |||||
-| Q5_0 | 🚫 | 🚫 |||
-| Q5_1 | 🚫 | 🚫 |||
+| MXFP4 | 🚫 | 🚫 |||
+| Q5_0 |||||
+| Q5_1 |||||
 | Q8_0 |||||
 | Q2_K | 🚫 | 🚫 |||
 | Q3_K |||||

@@ -291,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 - 🚫 - acceleration unavailable, will still run using scalar implementation
 - ❓ - acceleration unknown, please contribute if you can test it yourself
 
-Last Updated by **Aaron Teo ([email protected])** on July 31, 2025.
+Last Updated by **Aaron Teo ([email protected])** on Aug 22, 2025.

examples/llama.vim (1 addition, 1 deletion)

@@ -17,7 +17,7 @@
 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
 "
 "   --batch-size [512, model max context]
 "

ggml/src/ggml-cann/aclnn_ops.cpp (119 additions, 38 deletions)

@@ -867,6 +867,86 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
     return acl_tensor;
 }
 
+/**
+ * @brief Fills a tensor with a scalar value.
+ *
+ * This function fills the destination tensor `acl_dst` with the scalar value
+ * `scalar`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param scalar The scalar value used to fill the tensor.
+ * @param acl_dst The destination tensor to be filled with the scalar value.
+ */
+static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
+                              aclTensor* acl_dst) {
+    auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
+    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
+    ggml_cann_release_resources(ctx, acl_scalar);
+}
+
+/**
+ * @brief Get or expand a cached float32 tensor filled with a scalar value.
+ *
+ * This function manages cached device memory for float32 tensors. If the current
+ * cache size is insufficient for the requested tensor shape, the old memory will
+ * be released and new memory will be allocated. The allocated buffer is then
+ * initialized either with zeros (when @p value == 0.0f) or with the given scalar
+ * value using CANN operations. Finally, an aclTensor object is created from the
+ * cached memory and returned.
+ *
+ * @param ctx           The CANN backend context that manages device memory.
+ * @param buffer        A pointer to the cached device buffer (will be allocated
+ *                      or reallocated if necessary).
+ * @param cache_element The current number of cached elements. This will be
+ *                      updated when the cache is expanded.
+ * @param ne            The tensor shape array (number of elements in each dimension).
+ * @param nb            The stride size for each dimension.
+ * @param dims          The number of tensor dimensions.
+ * @param value         The scalar value used to fill the tensor (supports zero
+ *                      initialization via memset or arbitrary values via fill_scalar).
+ * @return              An aclTensor pointer created from the cached buffer.
+ */
+static aclTensor* get_f32_cache_acl_tensor(
+        ggml_backend_cann_context& ctx,
+        void** buffer,
+        int64_t &cache_element,
+        int64_t* ne,
+        size_t* nb,
+        int64_t dims,
+        float value) {
+    // Calculate total number of elements
+    int64_t n_element = 1;
+    for (int i = 0; i < dims; i++) {
+        n_element *= ne[i];
+    }
+    size_t size = n_element * sizeof(float);
+
+    // Allocate or expand cache if needed
+    if (cache_element < n_element) {
+        if (*buffer != nullptr) {
+            aclrtFree(*buffer);
+            *buffer = nullptr;
+        }
+
+        ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        cache_element = n_element;
+
+        // Initialize cache
+        if (value == 0.0f) {
+            ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
+        } else {
+            int64_t pool_ne[1] = { n_element };
+            size_t pool_nb[1] = { sizeof(float) };
+            aclTensor* acl_value = ggml_cann_create_tensor(
+                *buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
+            aclnn_fill_scalar(ctx, 1, acl_value);
+            ggml_cann_release_resources(ctx, acl_value);
+        }
+    }
+
+    return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
+}
+
 void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
 
@@ -875,20 +955,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
-    size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
-    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
-
-    aclTensor* acl_gamma = aclnn_values(
-        ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
-        ggml_cann_type_mapping(src->type), ggml_element_size(src));
-
-    size_t zero_tensor_n_bytes =
-        src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
-    ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
-    aclTensor* acl_rstd =
-        aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
-                   src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
-                   ggml_element_size(src));
+
+    // build gamma, one...
+    size_t acl_gamma_nb[GGML_MAX_DIMS];
+    acl_gamma_nb[0] = sizeof(float);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
+    }
+    aclTensor* acl_gamma = get_f32_cache_acl_tensor(
+        ctx,
+        &ctx.f32_one_cache,
+        ctx.f32_one_cache_element,
+        src->ne,
+        acl_gamma_nb,
+        1,    // dims
+        1.0f  // value
+    );
+
+    // build rstd, zero...
+    size_t acl_rstd_nb[GGML_MAX_DIMS];
+    acl_rstd_nb[0] = sizeof(float);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1];
+    }
+    aclTensor* acl_rstd = get_f32_cache_acl_tensor(
+        ctx,
+        &ctx.f32_zero_cache,
+        ctx.f32_zero_cache_element,
+        src->ne,
+        acl_rstd_nb,
+        GGML_MAX_DIMS,
+        0.0f  // value
+    );
+
     GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
     ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
 }

@@ -903,14 +1002,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
 
     const int n_past = ((int32_t*)dst->op_params)[0];
 
-    size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
-                                src->ne[3] * ggml_element_size(src);
-    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
+    ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
+    void* buffer = one_tensor_allocator.get();
 
-    aclTensor* mask_tensor =
-        aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
-                     src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
-                     ggml_element_size(src), value);
+    aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
+        ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
+
+    aclnn_fill_scalar(ctx, value, mask_tensor);
 
     aclScalar* alpha = nullptr;
     float alphaValue = 1.0f;

@@ -1277,23 +1375,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
                                       tmp_permute_tensor, tmp_mul_tensor, acl_dst);
 }
 
-/**
- * @brief Fills a tensor with a scalar value.
- *
- * This function fills the destination tensor `acl_dst` with the scalar value
- * `scalar`.
- *
- * @param ctx The context for the CANN backend operations.
- * @param scalar The scalar value used to fill the tensor.
- * @param acl_dst The destination tensor to be filled with the scalar value.
- */
-static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
-                              aclTensor* acl_dst) {
-    auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
-    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
-    ggml_cann_release_resources(ctx, acl_scalar);
-}
-
 /**
  * @brief Raises each element of a tensor to the power of the corresponding
  * element in another tensor.
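For readers unfamiliar with the CANN backend, the core idea of get_f32_cache_acl_tensor above is a grow-only cache of a device buffer pre-filled with a single constant, so ggml_cann_rms_norm no longer allocates and refills its gamma (ones) and rstd (zeros) helpers on every call. A simplified host-side sketch of the same pattern, with plain C++ allocation standing in for aclrtMalloc/aclrtFree (the struct and names below are illustrative, not the backend's API):

#include <algorithm>
#include <cstdint>
#include <cstring>

// Grow-only cache of a float buffer holding one constant value.
// Reuse the buffer while it is large enough; otherwise release it,
// allocate a larger one and re-initialize it.
struct f32_const_cache {
    float * buffer   = nullptr;
    int64_t capacity = 0; // number of cached elements

    float * get(int64_t n_element, float value) {
        if (capacity < n_element) {
            delete[] buffer;
            buffer   = new float[n_element];
            capacity = n_element;
            if (value == 0.0f) {
                std::memset(buffer, 0, n_element * sizeof(float)); // cheap zero init
            } else {
                std::fill_n(buffer, n_element, value); // generic scalar fill
            }
        }
        return buffer; // caller views the first n_element entries
    }

    ~f32_const_cache() { delete[] buffer; }
};

As in the diff, keeping one cache per constant (f32_zero_cache for 0.0f, f32_one_cache for 1.0f) means a reused buffer never has to be refilled for a different value; only growth triggers re-initialization.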

ggml/src/ggml-cann/common.h (4 additions, 0 deletions)

@@ -379,6 +379,10 @@ struct ggml_backend_cann_context {
     cann_task_queue task_queue;
     bool async_mode;
     bool support_set_rows;
+    void* f32_zero_cache = nullptr;
+    void* f32_one_cache = nullptr;
+    int64_t f32_zero_cache_element = 0;
+    int64_t f32_one_cache_element = 0;
 
     aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
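The four fields added above are the backing storage for the zero/one caches used by get_f32_cache_acl_tensor in aclnn_ops.cpp. Since those buffers are allocated lazily with aclrtMalloc, they also need to be released when the context is torn down; a hypothetical cleanup helper is sketched below (release_f32_caches is a made-up name, this diff does not show where or whether the backend performs such teardown, and the sketch assumes the ACL headers and the context definition from this file are available):

// Hypothetical sketch only: free the lazily allocated scalar caches and reset
// the bookkeeping so a later get_f32_cache_acl_tensor call re-allocates them.
static void release_f32_caches(ggml_backend_cann_context & ctx) {
    if (ctx.f32_zero_cache != nullptr) {
        aclrtFree(ctx.f32_zero_cache);
        ctx.f32_zero_cache         = nullptr;
        ctx.f32_zero_cache_element = 0;
    }
    if (ctx.f32_one_cache != nullptr) {
        aclrtFree(ctx.f32_one_cache);
        ctx.f32_one_cache         = nullptr;
        ctx.f32_one_cache_element = 0;
    }
}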

ggml/src/ggml-cpu/arch-fallback.h (0 additions, 2 deletions)

@@ -150,8 +150,6 @@
 #elif defined(__s390x__)
 // quants.c
 #define quantize_row_q8_K_generic quantize_row_q8_K
-#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
-#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
 #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
 #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
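Context for the two deleted defines: arch-fallback.h maps each *_generic reference kernel onto its public symbol name on architectures that have no hand-written implementation. Removing the Q5_0/Q5_1 entries under __s390x__ means those dot products are now expected to come from native s390x kernels, which lines up with the Q5_0/Q5_1 rows changing in the docs/build-s390x.md table above. A minimal, self-contained illustration of the renaming mechanism (vec_dot and HAVE_NATIVE_VEC_DOT are made-up names, not ggml's):

#include <cstdio>

// On targets without a native kernel, rename the generic implementation to the
// public symbol, mirroring lines such as
//     #define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
// Dropping the define means a native definition of the public symbol must be
// provided elsewhere for that architecture.
#if !defined(HAVE_NATIVE_VEC_DOT)
#define vec_dot_generic vec_dot
#endif

// The reference kernel; with the define above it is compiled under the public
// name vec_dot.
static float vec_dot_generic(const float * a, const float * b, int n) {
    float acc = 0.0f;
    for (int i = 0; i < n; i++) {
        acc += a[i] * b[i];
    }
    return acc;
}

int main() {
    const float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    const float b[4] = {4.0f, 3.0f, 2.0f, 1.0f};
    std::printf("vec_dot = %.1f\n", vec_dot(a, b, 4)); // prints 20.0
    return 0;
}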
