Merged: changes from all commits (36 commits)
b9c3eef  CUDA: add bf16 and i32 to getrows (#14529) - am17an, Jul 7, 2025
12f55c3  llama : remove ggml_cont where possible (#14568) - CISC, Jul 7, 2025
e1a7059  llama : fix incorrect minicpm3 v_states shape (#14571) - CISC, Jul 7, 2025
68155c6  musa: fix build warnings (unused variable) (#14561) - yeahdongcn, Jul 7, 2025
75c91de  CUDA: add bilinear interpolation for upscale (#14563) - am17an, Jul 8, 2025
4d0dcd4  cuda : fix rope with partial rotation and non-cont src (#14580) - ggerganov, Jul 8, 2025
53903ae  vulkan: increase timeout for CI (#14574) - jeffbolznv, Jul 8, 2025
8f22dc0  model : add hunyuan moe (#14425) - ngxson, Jul 8, 2025
17a1f0d  server: Add ability to mount server at prefix (#14544) - oluwandabira, Jul 8, 2025
b8eeb87  vulkan : fix rope with partial rotation and non-cont src (#14582) - jeffbolznv, Jul 8, 2025
bb4f7a9  memory : fix broken batch splits for recurrent cache (#14575) - compilade, Jul 8, 2025
0838286  model : add SmolLM3 (#14581) - ngxson, Jul 8, 2025
699f439  model : fix hunyuan moe chat template (#14584) - stevenkuang-tencent, Jul 8, 2025
6efcd65  vulkan: optimize flash attention split_k_reduce (#14554) - jeffbolznv, Jul 8, 2025
20b7bf8  convert : fix smollm3 jinja template (#14586) - ngxson, Jul 9, 2025
0465506  model : add support for Falcon-H1 family (#14534) - ibrahimkhadraoui, Jul 9, 2025
1055545  llama : remove unintended whitespace (#14592) - CISC, Jul 9, 2025
ffd59e7  model : add skt/A.X-4.0 model vocabulary (#14589) - Bing-su, Jul 9, 2025
26a48ad  ggml : prevent integer overflow in gguf tensor size calculation (#14595) - Yuuoniy, Jul 9, 2025
98bab63  ggml : add ggml_scale_bias (#14417) - ngxson, Jul 9, 2025
4a5686d  llama : support Jamba hybrid Transformer-Mamba models (#7531) - compilade, Jul 9, 2025
cb9178f  llama : remove llm_graph_input_one (#14603) - ngxson, Jul 9, 2025
a57d1bc  cuda : support Falcon-H1 state size for SSM_SCAN (#14602) - compilade, Jul 10, 2025
ac44eb6  cmake : llguidance build parser library only (#14608) - EZForever, Jul 10, 2025
f9a867f  cmake : bump llguidance version to v1.0.1 (#14609) - EZForever, Jul 10, 2025
435a6d1  llama : minor coding style fix for smollm3 (#14605) - ngxson, Jul 10, 2025
704bb7a  SYCL: Initial set_rows kernel implementation (#14562) - qnixsynapse, Jul 10, 2025
a457551  cmake : do not search for curl libraries by ourselves (#14613) - EZForever, Jul 10, 2025
11ee0fe  Docs: script to auto-generate ggml operations docs (#14598) - am17an, Jul 10, 2025
4bb625b  Smoldocling support (#14597) - ryan-mangeno, Jul 10, 2025
0b88557  opencl: add `set_rows` for `f16` and `f32` (#14547) - lhez, Jul 10, 2025
6bdda13  opencl: add tiled mul_mat_f16_f32 (#14535) - rmatif, Jul 10, 2025
0aedae0  model : Granite Four (#13550) - gabe-l-hart, Jul 11, 2025
576c82e  vocab : add midm-2.0 model pre-tokenizer (#14626) - Bing-su, Jul 11, 2025
0d5375d  llama : move enum llama_vocab_pre_type to implementation (#14631) - ggerganov, Jul 11, 2025
aaa088d  readme : add hot PRs (#14636) - ggerganov, Jul 11, 2025
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -342,7 +342,7 @@ jobs:
cd build
export GGML_VK_VISIBLE_DEVICES=0
# This is using llvmpipe and runs slower than other backends
-ctest -L main --verbose --timeout 3600
+ctest -L main --verbose --timeout 4200

ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
40 changes: 40 additions & 0 deletions .github/workflows/update-ops-docs.yml
@@ -0,0 +1,40 @@
+name: Update Operations Documentation
+
+on:
+  push:
+    paths:
+      - 'docs/ops/**'
+      - 'scripts/create_ops_docs.py'
+  pull_request:
+    paths:
+      - 'docs/ops/**'
+      - 'scripts/create_ops_docs.py'
+
+jobs:
+  update-ops-docs:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Generate operations documentation to temporary file
+        run: |
+          mkdir -p /tmp/ops_check
+          ./scripts/create_ops_docs.py /tmp/ops_check/ops.md
+
+      - name: Check if docs/ops.md matches generated version
+        run: |
+          if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
+            echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
+            echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
+            echo "Differences found:"
+            diff docs/ops.md /tmp/ops_check/ops.md || true
+            exit 1
+          fi
+          echo "Operations documentation is up to date."
9 changes: 4 additions & 5 deletions README.md
@@ -6,9 +6,9 @@
[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)

-Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
+LLM inference in C/C++

## Recent API changes

@@ -17,10 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

## Hot topics

-- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
+- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
+- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
9 changes: 4 additions & 5 deletions common/CMakeLists.txt
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
endif()
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
-find_library(CURL_LIBRARY curl REQUIRED)
-set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
endif ()

if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)

ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-# v0.7.20 (+ fix to build on GCC 15):
-GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
+# v1.0.1:
+GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
-BUILD_COMMAND cargo build --release
+BUILD_COMMAND cargo build --release --package llguidance
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
UPDATE_COMMAND ""
7 changes: 7 additions & 0 deletions common/arg.cpp
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.public_path = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+add_opt(common_arg(
+    {"--api-prefix"}, "PREFIX",
+    string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+    [](common_params & params, const std::string & value) {
+        params.api_prefix = value;
+    }
+).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
{"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
1 change: 1 addition & 0 deletions common/common.h
@@ -370,6 +370,7 @@ struct common_params {

std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
+std::string api_prefix = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
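
Together, these two hunks wire up the `--api-prefix` option from #14544, which mounts every `llama-server` route under a caller-chosen path. A minimal usage sketch, assuming a local model file (the model path, port, and prefix are illustrative) and that the standard endpoints simply move under the prefix:

```sh
# Mount the server under /llm (no trailing slash, per the option's help text)
llama-server -m ./model.gguf --port 8080 --api-prefix /llm

# Endpoints are now reachable under the prefix
curl http://localhost:8080/llm/health

# Equivalent configuration via the environment variable registered above
LLAMA_ARG_API_PREFIX=/llm llama-server -m ./model.gguf --port 8080
```

This is mainly useful behind a reverse proxy that routes several services off one hostname by path.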