Merged: changes from all 24 commits
cda16b7
CANN: Optimize RMS_NORM using cache (#15419)
noemotiovon Aug 22, 2025
3264759
readme : model : mtmd : lfm2 improvements (#15476)
tdakhran Aug 22, 2025
685b43e
server : Support multimodal completion and embeddings prompts in JSON…
65a Aug 22, 2025
6555340
ggml-cpu: Support Q5_0 and Q5_1 on s390x (#15486)
taronaeo Aug 22, 2025
613948c
llama : remove KV cache defragmentation logic (#15473)
ggerganov Aug 22, 2025
d73dfdf
cuda : add Pad Reflect 1D support (#14659)
YavorGIvanov Aug 22, 2025
0269314
ggml: add `conv3d` op (#15182)
rmatif Aug 22, 2025
95ad6b1
model : gpt-oss add response_format support (#15494)
aldehir Aug 22, 2025
2e72460
ggml WebGPU: add support for quantization types (#15440)
reeselevine Aug 22, 2025
33d4077
test-opt: allow slight imprecision (#15503)
JohannesGaessler Aug 22, 2025
39d8827
vulkan: optimize mul_mat_id loading row ids into shared memory (#15427)
jeffbolznv Aug 23, 2025
1f56a3c
vulkan : support ggml_mean (#15393)
Acly Aug 23, 2025
d2c8ec6
vulkan.Dockerfile: install vulkan SDK using tarball (#15282)
yeahdongcn Aug 23, 2025
14768e6
vulkan: Rewrite synchronization to allow some overlap between nodes (…
jeffbolznv Aug 23, 2025
3ee528a
chat : fix debug build assertion in trim function (#15520)
LaffeyNyaa Aug 23, 2025
8ea9f36
scripts: fix compare-llama-bench.py (#15521)
JohannesGaessler Aug 23, 2025
cb045ed
model : add support for Seed-OSS (#15490)
pwilkin Aug 23, 2025
79eeb46
vulkan: optimize rms_norm, and allow the work to spread across multip…
jeffbolznv Aug 23, 2025
17bbe29
CUDA: fix half2 -> half conversion for HIP (#15529)
JohannesGaessler Aug 23, 2025
56eb482
vulkan: workaround MoltenVK compile failure in multi_add (#15506)
jeffbolznv Aug 24, 2025
e3c01e6
vulkan: enable Conv2D for Apple after MoltenVK fixed the bug (#15526)
0cc4m Aug 24, 2025
24cd0d1
vulkan: Support FA with any multiple of 8 head sizes (#15537)
jeffbolznv Aug 24, 2025
0289568
kv-cache : support layer reuse (#15504)
ggerganov Aug 24, 2025
3fa428a
vulkan: apply MUL_MAT_ID subgroup optimization to non-coopmat devices…
0cc4m Aug 24, 2025
30 changes: 23 additions & 7 deletions .devops/vulkan.Dockerfile
@@ -2,14 +2,30 @@ ARG UBUNTU_VERSION=24.04

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils

# Install Vulkan SDK
ARG VULKAN_VERSION=1.4.321.1
RUN ARCH=$(uname -m) && \
wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
mkdir -p /opt/vulkan && \
tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
mv /tmp/${ARCH}/* /opt/vulkan/ && \
rm -rf /tmp/*

# Install cURL and Vulkan SDK dependencies
RUN apt install -y libcurl4-openssl-dev curl \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev

# Set environment variables
ENV VULKAN_SDK=/opt/vulkan
ENV PATH=$VULKAN_SDK/bin:$PATH
ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH

# Build it
WORKDIR /app
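Note: VULKAN_VERSION is declared as a build ARG, so the pinned SDK release (1.4.321.1 above) can be overridden at image build time with docker build --build-arg VULKAN_VERSION=<version>, without editing the Dockerfile.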
1 change: 1 addition & 0 deletions README.md
@@ -151,6 +151,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
- [x] [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)

</details>

6 changes: 4 additions & 2 deletions common/arg.cpp
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"-dt", "--defrag-thold"}, "N",
string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
string_format("KV cache defragmentation threshold (DEPRECATED)"),
[](common_params & params, const std::string & value) {
params.defrag_thold = std::stof(value);
GGML_UNUSED(params);
GGML_UNUSED(value);
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
}
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
add_opt(common_arg(
22 changes: 21 additions & 1 deletion common/chat.cpp
@@ -1361,6 +1361,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
"<|end|>",
};

if (!inputs.json_schema.is_null()) {
data.grammar_lazy = false;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schema = inputs.json_schema;
builder.resolve_refs(schema);

auto not_end = builder.add_rule("not-end",
"[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
auto analysis = builder.add_rule("analysis",
"\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
auto final = builder.add_rule("final",
"\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
builder.add_schema("response", schema)
);

builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
});
}

if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2121,7 +2141,7 @@ static common_chat_params common_chat_templates_apply_jinja(
}

// GPT-OSS
if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
if (src.find("<|channel|>") != std::string::npos) {
return common_chat_params_init_gpt_oss(tmpl, params);
}

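Net effect: when a response_format JSON schema is supplied, gpt-oss output is constrained to an optional analysis channel followed by a final channel whose message must match the schema. As a rough illustration (hypothetical output, assuming a schema that accepts an object with a numeric "answer" field), a completion accepted by this grammar would look like:

```
<|channel|>analysis<|message|>…model reasoning…<|end|><|start|>assistant<|channel|>final<|message|>{"answer": 42}
```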
1 change: 0 additions & 1 deletion common/common.cpp
@@ -1152,7 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.attention_type = params.attention_type;
cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
1 change: 0 additions & 1 deletion common/common.h
@@ -288,7 +288,6 @@ struct common_params {
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = 0.1f; // KV cache defragmentation threshold

// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
5 changes: 5 additions & 0 deletions convert_hf_to_gguf.py
@@ -5854,6 +5854,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("SeedOssForCausalLM")
class SeedOssModel(TextModel):
model_arch = gguf.MODEL_ARCH.SEED_OSS


@ModelBase.register("Olmo2ForCausalLM")
class Olmo2Model(TextModel):
model_arch = gguf.MODEL_ARCH.OLMO2
7 changes: 4 additions & 3 deletions docs/build-s390x.md
@@ -265,8 +265,9 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
| Q5_0 | 🚫 | 🚫 | ❓ | ❓ |
| Q5_1 | 🚫 | 🚫 | ❓ | ❓ |
| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
| Q5_0 | ✅ | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ✅ | ❓ | ❓ |
@@ -291,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

Last Updated by **Aaron Teo ([email protected])** on July 31, 2025.
Last Updated by **Aaron Teo ([email protected])** on Aug 22, 2025.
2 changes: 1 addition & 1 deletion examples/llama.vim
@@ -17,7 +17,7 @@
"
" start the llama.cpp server with a FIM-compatible model. for example:
"
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
"
" --batch-size [512, model max context]
"
18 changes: 18 additions & 0 deletions ggml/include/ggml.h
@@ -512,6 +512,7 @@ extern "C" {
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D,
GGML_OP_CONV_3D,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
@@ -1940,6 +1941,23 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1

GGML_API struct ggml_tensor * ggml_conv_3d(
struct ggml_context * ctx,
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
struct ggml_tensor * b, // input [W, H, D, C * N]
int s0, // stride
int s1,
int s2,
int p0, // padding
int p1,
int p2,
int d0, // dilation
int d1,
int d2,
int n_channels,
int n_batch,
int n_channels_out);

enum ggml_op_pool {
GGML_OP_POOL_MAX,
GGML_OP_POOL_AVG,
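For context, a minimal sketch of how the new op might be invoked (illustrative shapes and hyperparameters; the ggml context setup is assumed and not part of this diff):

```cpp
// Sketch: a 3x3x3 convolution over a 16x16x16 volume with 4 input channels,
// 8 output channels, batch of 1. Tensor layouts follow the header comments
// above; all sizes here are illustrative.
#include "ggml.h"

static struct ggml_tensor * conv3d_example(struct ggml_context * ctx) {
    const int IC = 4, OC = 8, N = 1;
    // kernel: [KW, KH, KD, IC * OC]
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, IC * OC);
    // input: [W, H, D, C * N]
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 16, 16, 16, IC * N);
    // unit stride and dilation, padding 1 in each spatial dimension
    return ggml_conv_3d(ctx, kernel, input,
                        1, 1, 1,  // s0, s1, s2
                        1, 1, 1,  // p0, p1, p2
                        1, 1, 1,  // d0, d1, d2
                        IC, N, OC);
}
```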
157 changes: 119 additions & 38 deletions ggml/src/ggml-cann/aclnn_ops.cpp
@@ -867,6 +867,86 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
return acl_tensor;
}

/**
* @brief Fills a tensor with a scalar value.
*
* This function fills the destination tensor `acl_dst` with the scalar value
* `scalar`.
*
* @param ctx The context for the CANN backend operations.
* @param scalar The scalar value used to fill the tensor.
* @param acl_dst The destination tensor to be filled with the scalar value.
*/
static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
aclTensor* acl_dst) {
auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
ggml_cann_release_resources(ctx, acl_scalar);
}

/**
* @brief Get or expand a cached float32 tensor filled with a scalar value.
*
* This function manages cached device memory for float32 tensors. If the current
* cache size is insufficient for the requested tensor shape, the old memory will
* be released and new memory will be allocated. The allocated buffer is then
* initialized either with zeros (when @p value == 0.0f) or with the given scalar
* value using CANN operations. Finally, an aclTensor object is created from the
* cached memory and returned.
*
* @param ctx The CANN backend context that manages device memory.
* @param buffer A pointer to the cached device buffer (will be allocated
* or reallocated if necessary).
* @param cache_element The current number of cached elements. This will be
* updated when the cache is expanded.
* @param ne The tensor shape array (number of elements in each dimension).
* @param nb The stride size for each dimension.
* @param dims The number of tensor dimensions.
* @param value The scalar value used to fill the tensor (supports zero
* initialization via memset or arbitrary values via fill_scalar).
* @return An aclTensor pointer created from the cached buffer.
*/
static aclTensor* get_f32_cache_acl_tensor(
ggml_backend_cann_context& ctx,
void** buffer,
int64_t &cache_element,
int64_t* ne,
size_t* nb,
int64_t dims,
float value) {
// Calculate total number of elements
int64_t n_element = 1;
for (int i = 0; i < dims; i++) {
n_element *= ne[i];
}
size_t size = n_element * sizeof(float);

// Allocate or expand cache if needed
if (cache_element < n_element) {
if (*buffer != nullptr) {
aclrtFree(*buffer);
*buffer = nullptr;
}

ACL_CHECK(aclrtMalloc(buffer, size, ACL_MEM_MALLOC_HUGE_FIRST));
cache_element = n_element;

// Initialize cache
if (value == 0.0f) {
ACL_CHECK(aclrtMemsetAsync(*buffer, size, 0, size, ctx.stream()));
} else {
int64_t pool_ne[1] = { n_element };
size_t pool_nb[1] = { sizeof(float) };
aclTensor* acl_value = ggml_cann_create_tensor(
*buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
aclnn_fill_scalar(ctx, value, acl_value);
ggml_cann_release_resources(ctx, acl_value);
}
}

return ggml_cann_create_tensor(*buffer, ACL_FLOAT, sizeof(float), ne, nb, dims);
}

void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];

@@ -875,20 +955,39 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

float eps;
memcpy(&eps, dst->op_params, sizeof(float));
size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);

aclTensor* acl_gamma = aclnn_values(
ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
ggml_cann_type_mapping(src->type), ggml_element_size(src));

size_t zero_tensor_n_bytes =
src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src);
ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes);
aclTensor* acl_rstd =
aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes,
src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
ggml_element_size(src));

// build gamma, one...
size_t acl_gamma_nb[GGML_MAX_DIMS];
acl_gamma_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
}
aclTensor* acl_gamma = get_f32_cache_acl_tensor(
ctx,
&ctx.f32_one_cache,
ctx.f32_one_cache_element,
src->ne,
acl_gamma_nb,
1, // dims
1.0f // value
);

// build rstd, zero...
size_t acl_rstd_nb[GGML_MAX_DIMS];
acl_rstd_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1];
}
aclTensor* acl_rstd = get_f32_cache_acl_tensor(
ctx,
&ctx.f32_zero_cache,
ctx.f32_zero_cache_element,
src->ne,
acl_rstd_nb,
GGML_MAX_DIMS,
0.0f // value
);

GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
}
@@ -903,14 +1002,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,

const int n_past = ((int32_t*)dst->op_params)[0];

size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *
src->ne[3] * ggml_element_size(src);
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
void* buffer = one_tensor_allocator.get();

aclTensor* mask_tensor =
aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
ggml_element_size(src), value);
aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);

aclnn_fill_scalar(ctx, value, mask_tensor);

aclScalar* alpha = nullptr;
float alphaValue = 1.0f;
@@ -1277,23 +1375,6 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
tmp_permute_tensor, tmp_mul_tensor, acl_dst);
}

/**
* @brief Fills a tensor with a scalar value.
*
* This function fills the destination tensor `acl_dst` with the scalar value
* `scalar`.
*
* @param ctx The context for the CANN backend operations.
* @param scalar The scalar value used to fill the tensor.
* @param acl_dst The destination tensor to be filled with the scalar value.
*/
static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
aclTensor* acl_dst) {
auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
ggml_cann_release_resources(ctx, acl_scalar);
}

/**
* @brief Raises each element of a tensor to the power of the corresponding
* element in another tensor.
4 changes: 4 additions & 0 deletions ggml/src/ggml-cann/common.h
@@ -379,6 +379,10 @@ struct ggml_backend_cann_context {
cann_task_queue task_queue;
bool async_mode;
bool support_set_rows;
void* f32_zero_cache = nullptr;
void* f32_one_cache = nullptr;
int64_t f32_zero_cache_element = 0;
int64_t f32_one_cache_element = 0;

aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */

2 changes: 0 additions & 2 deletions ggml/src/ggml-cpu/arch-fallback.h
@@ -150,8 +150,6 @@
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K