Changes from all commits
1018 commits
879dec3
ggml-cpu : use template for argsort (#17222)
slaren Nov 13, 2025
2776db6
Revert "ggml-cpu: handle 3d tensors in repack mat_mul (#17030)" (#17233)
ggerganov Nov 13, 2025
0cfb191
metal: accelerated conv2d (#17175)
bghira Nov 13, 2025
1215dde
ggml-cpu : add RISC-V vector intrinsic support for silu and cvar oper…
ixgbe Nov 13, 2025
dd091e5
sched : fix reserve ignoring user tensor assignments (#17232)
slaren Nov 13, 2025
a19bd6f
vulkan: remove shell call from vulkan-shaders-gen tool, revert file c…
0cc4m Nov 13, 2025
389ac78
ggml : add ops SOFTPLUS, EXPM1, TRI, SOLVE_TRI, CUMSUM (#17063)
pwilkin Nov 13, 2025
c4abcb2
server: fixing naming conflict res_error (#17243)
ngxson Nov 13, 2025
becc481
ggml-cpu: handle 3d tensors in repack mat_mul (#17241)
Alcpz Nov 13, 2025
f1bad23
Better UX for handling multiple attachments in WebUI (#17246)
allozaur Nov 14, 2025
307772f
readme : add RVV,ZVFH,ZFH,ZICBOP support for RISC-V (#17259)
ixgbe Nov 14, 2025
2606b0a
metal : make the FA extra sizes consistent (#17143)
ggerganov Nov 14, 2025
45c6ef7
metal : support argsort for ne00 > 1024 (#17247)
ggerganov Nov 14, 2025
d396b43
server : fix "can batch with" bug (#17263)
ggerganov Nov 14, 2025
6cd0cf7
fix : Dangling pointer for non-empty trigger words in lazy grammar co…
marek-hradil Nov 14, 2025
e1fcf8b
model : add AfmoeForCausalLM support (#16477)
bartowski1182 Nov 14, 2025
9b17d74
mtmd: add mtmd_log_set (#17268)
ngxson Nov 14, 2025
38eaf32
vulkan: change graph_compute to be async and enable get_tensor_async …
jeffbolznv Nov 15, 2025
234ae7d
vulkan: skip all-negative-inf blocks in FA (#17186)
jeffbolznv Nov 15, 2025
439342e
vulkan: Use ggml_vk_tensor_subbuffer in mul_mat_vec(id) paths (#17244)
jeffbolznv Nov 15, 2025
1568d13
vulkan: implement ABS and NEG (#17245)
giuseppe Nov 15, 2025
c7b7db0
mtmd-cli: Avoid logging to stdout for model loading messages in mtmd-…
ankurvdev Nov 15, 2025
9d3ef48
convert : set expert gating func in base class (#17279)
CISC Nov 15, 2025
9a8860c
convert : use all parts in safetensors index (#17286)
CISC Nov 15, 2025
4dca015
vulkan: Replace 16-bit unpack8 calls to work around legacy Windows AM…
0cc4m Nov 15, 2025
24dc769
vulkan: Fuse mul_mat_id+add_id+mul and mul_mat+add+add. (#17287)
jeffbolznv Nov 15, 2025
662192e
convert : remove unnecessary chat template patching (#17289)
CISC Nov 15, 2025
1411d92
webui: add OAI-Compat Harmony tool-call streaming visualization and p…
ServeurpersoCom Nov 15, 2025
22e1ce2
webui: Fix clickability around chat processing statistics UI (#17278)
allozaur Nov 15, 2025
72bd732
sycl : unify unary kernels with a generic implementation and enable w…
shani-f Nov 15, 2025
4db5641
opencl: add kernel to handle mat mul in attention to improve encoding…
shaofeiqi Nov 16, 2025
52e5d42
opencl: fix rms_norm_mul (#17250)
lhez Nov 16, 2025
5b2093b
server : handle context overflow during decode (#17267)
ggerganov Nov 16, 2025
416e7c7
metal : remove obsolete asserts (#17295)
ggerganov Nov 16, 2025
8b1c339
ci : revert #16249 (#17303)
netrunnereve Nov 16, 2025
80deff3
vulkan: fix MMQ quantize_y condition (#17301)
0cc4m Nov 16, 2025
dbed612
vulkan: add LOG operation support for F32 and F16 (#17183)
zayac Nov 16, 2025
2376b77
CANN: Use smart pointers to manage ACL objects (#17238)
hipudding Nov 17, 2025
1a13964
metal : add cumsum (#17305)
ggerganov Nov 17, 2025
3347e6d
metal : faster argsort (#17315)
ggerganov Nov 17, 2025
7aaeedc
metal : support I32 -> I32 copy (#17317)
ggerganov Nov 17, 2025
cb623de
ggml : add missing AVX512 feature checks (#17270)
angt Nov 17, 2025
cb44fc8
cmake : fix ARM feature verification (#17170)
angt Nov 17, 2025
38e2c1b
vulkan: add log RTE support to fix Nvidia CI (#17320)
0cc4m Nov 17, 2025
0de8878
server: split HTTP into its own interface (#17216)
ngxson Nov 17, 2025
da95bf2
vulkan: support noncontig i32 copy (#17328)
jeffbolznv Nov 18, 2025
ffa277a
CANN: Add openEuler-cann in build and release (#17192)
xuedinge233 Nov 18, 2025
97cb3fd
fix: resolve undefined variable 'svr' compilation error (#17348)
o7si Nov 18, 2025
bc4064c
CANN: fix acl_tensor_ptr usage in ASCEND_310P ROPE (#17347)
noemotiovon Nov 18, 2025
f40a2e5
gitignore : be more specific about ignored stuff (#17354)
ggerganov Nov 18, 2025
561a3e2
ci : change the openEuler-310p image to fix release (#17361)
xuedinge233 Nov 18, 2025
1920345
common : Generalized XML-style tool-call parsing with streaming suppo…
hksdpc255 Nov 18, 2025
a045492
vocab : call reserve() for building plamo-2-translate suffix (#17343)
haiyuewa Nov 18, 2025
10e9780
chat: fix int overflow, prevent size calculation in float/double (#17…
ngxson Nov 18, 2025
c49daff
ggml-cpu: Don't pass -mpowerpc64 when -mcpu already implies it (#17308)
JeremyRand Nov 19, 2025
980b7cd
vulkan: force full subgroups for flash attention to fix intel subgrou…
0cc4m Nov 19, 2025
6fd4f95
Fix too relaxed check on CUDA "fast copy" (can_be_transposed) conditi…
pwilkin Nov 19, 2025
fd7353d
cuda: fix rope fusion for gemma3 (#17378)
am17an Nov 19, 2025
07b0e7a
convert : use self.block_count everywhere instead of reading hparams …
CISC Nov 19, 2025
99c53d6
webui: Add a "Continue" Action for Assistant Message (#16971)
allozaur Nov 19, 2025
2eba631
vulkan: Add copy_transpose shader (#17371)
jeffbolznv Nov 19, 2025
1fa4551
vulkan: support larger argsort (#17313)
jeffbolznv Nov 19, 2025
7d77f07
vulkan: implement ADD1, ARANGE, FILL, SOFTPLUS, STEP, ROUND, CEIL, FL…
giuseppe Nov 19, 2025
5be353e
ggml-cpu:add RISC-V RVV (Zvfh) optimization for FP16 vector scaling (…
ixgbe Nov 20, 2025
3ae282a
kleidiai: fix zero-size array declaration (#17240)
sudhiarm Nov 20, 2025
79bb743
ggml : remove useless and error-prone variadic macros (#17399)
angt Nov 20, 2025
a7784a8
DGX Spark: UMA support (#17368)
sfudally-nvidia Nov 20, 2025
845f200
ggml : Fix transposed SOLVE_TRI result (#17323)
pwilkin Nov 20, 2025
5088b43
convert : fix TypeError when loading base model remotely in convert_l…
o7si Nov 20, 2025
196f508
common : more accurate sampling timing (#17382)
ggerganov Nov 20, 2025
1d321e5
metal : fix compile on macos 11 (whisper/3533)
smilingpoplar Nov 20, 2025
2286a36
sync : ggml
ggerganov Nov 20, 2025
92c0b38
grammar : fix integer overflow (#17381)
pwilkin Nov 20, 2025
4c91f26
Improved file naming & structure for UI components (#17405)
allozaur Nov 20, 2025
054a45c
grammar: fix regression caused by #17381 (#17412)
ngxson Nov 20, 2025
dd0f321
readme : add Unsloth exporting to GGUF in tools (#17411)
danielhanchen Nov 20, 2025
21d31e0
ggml-hexagon: fix swiglu failure at `test-backend-ops` (#17344)
chraac Nov 20, 2025
2370665
CANN: Refactor `evaluate_and_capture_cann_graph` (#17333)
rauletorresc Nov 21, 2025
f1ffbba
vulkan: disable async for older Intel devices (#17369)
jeffbolznv Nov 21, 2025
9cc4080
ci : start using OpenSSL (#17235)
angt Nov 21, 2025
28175f8
cmake : add option to build and link BoringSSL (#17205)
angt Nov 21, 2025
23bc779
model : detect GigaChat3-10-A1.8B as deepseek lite (#17420)
ubergarm Nov 21, 2025
8e9ddba
opencl: refine condition for kqv mm (#17392)
lhez Nov 21, 2025
028f93e
HIP: RDNA4 tensor core support for MMF (#17077)
zhang-hui-yulo Nov 21, 2025
3f3a4fb
Revive MUL_MAT_ID to perf testing (#17397)
rillomas Nov 22, 2025
4949ac0
ci : switch to BoringSSL on Server workflow (#17441)
angt Nov 22, 2025
54d83bb
vulkan: remove a couple unnecessary switches (#17419)
jeffbolznv Nov 23, 2025
bc809e9
vulkan: Update docker image to Ubuntu 26.04 to enable glslc features …
ericcurtin Nov 23, 2025
96ac5a2
cuda : support non-contiguous i32 to i32 copy (#17326)
CISC Nov 23, 2025
0c7220d
webui: minor settings reorganization and add disable autoscroll optio…
ServeurpersoCom Nov 23, 2025
d5bc1ad
ggml-hexagon: add `hex_supported_buffer` for better buffer supported …
chraac Nov 23, 2025
fcb0138
ggml-hexagon: Initial Hexagon v68/v69 support (#17394)
mediouni-m Nov 24, 2025
01ad35e
CANN: Define `cann_graph_update_required` before macro (#17434)
rauletorresc Nov 24, 2025
923ae3c
hexagon: add support for ROPE_NEOX (#17458)
max-krasnyansky Nov 24, 2025
4902eeb
models : Added support for RND1 Diffusion Language Model (#17433)
wp4032 Nov 24, 2025
5f55c38
ggml: add RISC-V cpu-feats (#17461)
ixgbe Nov 24, 2025
dbb852b
ggml-cpu: arm64: q4_K repack gemm and gemv implementations (i8mm) (#1…
Alcpz Nov 24, 2025
697edfe
ggml : remove dirty flag from version string (ggml/1391)
danbev Nov 24, 2025
2d50b9d
sync : ggml
ggerganov Nov 24, 2025
6ab8eac
examples : add -kvu to batched usage example [no ci] (#17469)
danbev Nov 24, 2025
b8372ee
server: split server.cpp code into server/common/task/queue (#17362)
ngxson Nov 24, 2025
b61de2b
convert : allow quantizing lora again (#17453)
CISC Nov 24, 2025
0543f92
HIP: WMMA-MMQ kernels for RDNA 4 (#17156)
jiachengjason Nov 24, 2025
134e694
llama : skip output reordering for single token batches (#17466)
danbev Nov 24, 2025
3d07caa
vulkan: more FA details in vk_perf_logger (#17443)
jeffbolznv Nov 24, 2025
877566d
llama: introduce support for model-embedded sampling parameters (#17120)
taronaeo Nov 25, 2025
d414db0
vulkan: Use fewer rows for scalar FA when HS is not a multiple of 16 …
jeffbolznv Nov 25, 2025
b1846f1
webui: add rehype plugin to restore HTML in Markdown table cells (#17…
ServeurpersoCom Nov 25, 2025
064c90d
CANN: supports out_prod operator for F32 and F16 (#17406)
TianHao324 Nov 25, 2025
55ab25c
codeowners : remove slaren (#17492)
slaren Nov 25, 2025
05872ac
convert : fix big-endian conversion (#17431)
AlekseiNikiforovIBM Nov 25, 2025
583cb83
ggml : add ggml_top_k (#17365)
ggerganov Nov 25, 2025
b3b03a7
vulkan: Implement GGML_OP_CUMSUM (#17479)
jeffbolznv Nov 26, 2025
f3a848a
chore: upgrade cpp-httplib from v0.27.0 to v0.28.0 (#17513)
o7si Nov 26, 2025
eeb5605
CANN: Add MROPE and IMROPE support (#17401)
hipudding Nov 26, 2025
3e18dba
HIP: Patch failed testcase in WMMA-MMQ kernels for RDNA 4 (#17502)
jiachengjason Nov 26, 2025
e6923ca
ggml : fix ARM feature verification (#17519)
angt Nov 26, 2025
2336cc4
cmake : use EXCLUDE_FROM_ALL to avoid patch-boringssl.cmake (#17520)
angt Nov 26, 2025
6ab4e50
ggml-cpu : add RISC-V Zvfh impl for ggml_vec_mad_f16 (#17448)
xctan Nov 26, 2025
879d673
vulkan: Implement top-k (#17418)
jeffbolznv Nov 26, 2025
eec1e33
vulkan: allow graph_optimize for prompt processing workloads (#17475)
jeffbolznv Nov 26, 2025
1d594c2
clip: (minicpmv) fix resampler kq_scale (#17516)
hNSBQZ Nov 26, 2025
5449367
Fix chunks being too small with small matrix sizes (#17526)
Alcpz Nov 26, 2025
7cba58b
opencl: add sqr, sqrt, mean and ssm_conv (#17476)
lhez Nov 26, 2025
e509411
server: enable jinja by default, update docs (#17524)
ngxson Nov 27, 2025
142df17
vulkan: use a fixed 1KB buffer for the add_rms_fusion opt (#17514)
jeffbolznv Nov 27, 2025
b78db3b
vulkan : move contiguous checks to device_supports_op (#17490)
Acly Nov 27, 2025
4fcd87c
gguf-py : skip endian-conversion of MXFP4 data (#17523)
AlekseiNikiforovIBM Nov 27, 2025
d21a76a
devops: Add build-essential to Ubuntu 26.04 image (#17531)
ericcurtin Nov 27, 2025
cd8370b
ggml-cpu: arm64: q4_K repack gemm and gemv implementations (dotprod …
Alcpz Nov 27, 2025
909072a
cuda : fix UMA detection on discrete GPUs. (#17537)
matt23654 Nov 27, 2025
6783b11
models : fix LFM2 tensors (#17548)
ggerganov Nov 27, 2025
c386114
arch : add description about LLM_TENSOR_INFOS (#17550)
ggerganov Nov 27, 2025
4abef75
vulkan: Implement SOLVE_TRI (#17486)
jeffbolznv Nov 27, 2025
efaaccd
refactor pad_reflect_1d to make the UT case pass (#17204)
NeoZhangJianyu Nov 28, 2025
cd0e3a7
SOLVE_TRI CUDA kernel for small matrices (#17457)
pwilkin Nov 28, 2025
6bca76f
HIP: enable mul_mat_f for RDNA4 (#17437)
zhang-hui-yulo Nov 28, 2025
15d2b46
rpc : cache and reuse compute graphs (#15405)
rgerganov Nov 28, 2025
35cf888
vulkan: Implement GGML_OP_TRI (#17503)
jeffbolznv Nov 28, 2025
73955f7
CUDA: no FP16 arithmetic for vector FA kernel (#17558)
JohannesGaessler Nov 28, 2025
ff55414
model : Qwen3 Next (#16095)
pwilkin Nov 28, 2025
ddf9f94
server : add Anthropic Messages API support (#17570)
noname22 Nov 28, 2025
2e7ef98
ggml-cuda: add stricter checking for fusion (#17568)
am17an Nov 28, 2025
c6f7a42
[MUSA] enable fp16/fast_fp16/bf16_mma on PH1 (#17551)
yeahdongcn Nov 28, 2025
e072b20
ggml : add GGML_SCHED_NO_REALLOC option to disable reallocations in g…
slaren Nov 28, 2025
3ce7a65
server: fix: /metrics endpoint returning JSON-escaped Prometheus form…
o7si Nov 28, 2025
03914c7
common : move all common_chat_parse_* to chat-parser.cpp. (#17481)
dranger003 Nov 28, 2025
d82b7a7
gguf-py : fix passing non-native endian tensors (editor-gui and new-m…
AlekseiNikiforovIBM Nov 28, 2025
59d8d4e
vulkan: improve topk perf for large k, fix overflow in unit tests (#1…
jeffbolznv Nov 29, 2025
47a268e
Vulkan: MMVQ Integer Dot K-Quant and MUL_MAT_ID support (#16900)
0cc4m Nov 29, 2025
f698a79
ggml: replace hwcap with riscv_hwprobe for RVV detection (#17567)
ixgbe Nov 29, 2025
7d2add5
sycl : support to malloc memory on device more than 4GB, update the d…
arthw Nov 29, 2025
0874693
common : fix json schema with '\' in literals (#17307)
i-v-s Nov 29, 2025
8c32d9d
server: explicitly set the function name in lambda (#17538)
haiyuewa Nov 29, 2025
ab49f09
server: move server-context to its own cpp|h (#17595)
ngxson Nov 29, 2025
385c3da
vulkan : fix FA mask load with bounds check (coopmat2) (#17606)
Acly Nov 30, 2025
00425e2
cuda : add error checking for cudaMemcpyAsync in argsort (#17599)
Mahekk357 Nov 30, 2025
c7af376
CUDA: add stream-based concurrency (#16991)
am17an Nov 30, 2025
5a6241f
common: update env var name (#17588)
ddh0 Nov 30, 2025
fa04659
ggml: fix: macOS build with `-DGGML_BACKEND_DL=ON` (#17581)
giladgd Nov 30, 2025
def5404
common: add LLAMA_LOG_FILE env var (#17609)
taronaeo Nov 30, 2025
beb1f0c
common : throttle download progress output to reduce IO flush (#17427)
angt Nov 30, 2025
3c136b2
cli: add migration warning (#17620)
ngxson Nov 30, 2025
7f8ef50
clip: fix nb calculation for qwen3-vl (#17594)
ngxson Nov 30, 2025
2ba7195
model: LFM2-VL fixes (#17577)
tdakhran Nov 30, 2025
0a4aeb9
cmake : add option to build and link LibreSSL (#17552)
angt Nov 30, 2025
ff90508
contributing: update guidelines for AI-generated code (#17625)
ngxson Nov 30, 2025
6eea666
llama-graph: avoid expand_forward for fusion (#17633)
am17an Dec 1, 2025
90c72a6
ggml : extend the GGML_SCHED_NO_REALLOC debug logic of the scheduler …
ggerganov Dec 1, 2025
649495c
metal : add FA head size 48 (#17619)
ggerganov Dec 1, 2025
cd3c118
model: support Ministral3 (#17644)
ngxson Dec 1, 2025
7733409
common: improve verbosity level definitions (#17630)
ngxson Dec 1, 2025
ec18edf
server: introduce API for serving / loading / unloading multiple mode…
ngxson Dec 1, 2025
00c361f
fix: llama arch implementation (#17665)
giladgd Dec 1, 2025
ecf74a8
mtmd: add mtmd_context_params::warmup option (#17652)
ngxson Dec 1, 2025
9810cb8
ops.md: update vulkan support (#17661)
jeffbolznv Dec 1, 2025
746f9ee
Override SSM_A op for Qwen3 Next to reduce splits (#17587)
pwilkin Dec 1, 2025
98bd9ab
enhance argsort for UT (#17573)
NeoZhangJianyu Dec 2, 2025
7b6d745
release: fix duplicate libs, store symbolic links (#17299)
taronaeo Dec 2, 2025
ed32089
ggml-cuda: reorder only relevant nodes (#17639)
am17an Dec 2, 2025
cee92af
Add context info to server error (#17663)
allozaur Dec 2, 2025
ab6726e
ggml : add fallback definition for HWCAP2_SVE2 (#17683)
angt Dec 2, 2025
4574f29
ci : skip winget update when not in ggml-org (#17465)
angt Dec 2, 2025
682e665
server: explicitly set exec path when create new instance (#17669)
ngxson Dec 2, 2025
fd3abe8
server: fixing naming conflict res_error in server-models.cpp (#17679)
w169q169 Dec 2, 2025
5d6bd84
server: remove default "gpt-3.5-turbo" model name (#17668)
ngxson Dec 2, 2025
2c453c6
convert: add error message for mistral3 quantized weight (#17686)
ngxson Dec 2, 2025
f3a9674
llama : fix signed comparison warning on FreeBSD (#17497)
angt Dec 2, 2025
b9a3771
codeowners : remove ericcurtin (#17658)
ericcurtin Dec 2, 2025
7f3a72a
ggml : remove redundant n_copies check when setting input/output (#17…
danbev Dec 2, 2025
a2b0fe8
CANN: Disable Ger operator of OUT_PROD on 310p device (#17563)
TianHao324 Dec 2, 2025
e148380
ggml : use svcntb() for SVE vector length detection (#17474)
angt Dec 2, 2025
c4357dc
Server: Change Invalid Schema from Server Error (500) to User Error (…
chadvoegele Dec 2, 2025
e251e5e
cmake : add utf8 compilation options for msvc (#17682)
xiaobing318 Dec 2, 2025
61bde8e
vulkan: Reduce temporary memory usage for TOP_K (#17623)
jeffbolznv Dec 2, 2025
4eba8d9
ci : RVV1.0 builds with tests (#16682)
alitariq4589 Dec 2, 2025
a96283a
mtmd: fix --no-warmup (#17695)
ngxson Dec 2, 2025
13628d8
server: add --media-path for local media files (#17697)
ngxson Dec 2, 2025
16cc3c6
build: document how to compile with Vulkan using Debian/Ubuntu packag…
socram8888 Dec 3, 2025
37adc9c
ggml, llama : use defaulted constructors/destructors (#17649)
GermanAizek Dec 3, 2025
b3e3060
ci : move release details to the top visible by default (#17719)
CISC Dec 3, 2025
7ca5991
ggml webgpu: add support for emscripten builds (#17184)
reeselevine Dec 3, 2025
5ceed62
server: fix duplicate HTTP headers in multiple models mode (#17698)
ServeurpersoCom Dec 3, 2025
0a8026e
common : introduce composable PEG parser combinators for chat parsing…
aldehir Dec 3, 2025
7feb0a1
ci : remove the build of openeuler-cann in release (#17724)
xuedinge233 Dec 3, 2025
3d94e96
metal : fix data race in pipeline library (#17731)
ggerganov Dec 3, 2025
083e18b
cmake: explicitly link against crypt32 on non-MSVC Windows builds (#1…
angt Dec 3, 2025
1257491
server : fix bad fmt, size() is a size_type (#17735)
angt Dec 3, 2025
e7c2cf1
server: add router multi-model tests (#17704) (#17722)
ServeurpersoCom Dec 3, 2025
190c483
chat : reserve memory in compute_diffs and improve naming (#17729)
ggerganov Dec 3, 2025
2e1c9cd
CUDA: generalized (mma) FA, add Volta support (#17505)
JohannesGaessler Dec 3, 2025
41c5e02
webui: Fix zero pasteLongTextToFileLen to disable conversion being ov…
awasisto Dec 3, 2025
e9f9483
Use OpenAI-compatible `/v1/models` endpoint by default (#17689)
allozaur Dec 3, 2025
424c579
convert : support latest mistral-common (fix conversion with --mistra…
SmartestWashingMachine Dec 3, 2025
c6d1a00
Add a couple of file types to the text section (#17670)
pwilkin Dec 3, 2025
dea9ba2
ggml-cpu: remove duplicate conditional check 'iid' (#17650)
GermanAizek Dec 3, 2025
d8b5cdc
build: enable parallel builds in msbuild using MTT (#17708)
jeffbolznv Dec 4, 2025
ef75a89
build : move _WIN32_WINNT definition to headers (#17736)
angt Dec 4, 2025
a67ef0f
llama : fix sanity checks during quantization (#17721)
ggerganov Dec 4, 2025
0d13248
metal : use params per pipeline instance (#17739)
ggerganov Dec 4, 2025
83c1171
common: use native MultiByteToWideChar (#17738)
angt Dec 4, 2025
7dba049
ci : disable ggml-ci-x64-amd-* (#17753)
CISC Dec 4, 2025
2a73f81
cmake : simplify build info detection using standard variables (#17423)
angt Dec 4, 2025
3659aa2
convert: use existing local chat_template if mistral-format model has…
SmartestWashingMachine Dec 4, 2025
87a2084
ggml-cpu : remove asserts always evaluating to false (#17728)
Alcpz Dec 4, 2025
bd4ef13
common : skip model validation when --help is requested (#17755)
danbev Dec 4, 2025
817d743
examples : add missing code block end marker [no ci] (#17756)
danbev Dec 4, 2025
c4c10bf
server: move msg diffs tracking to HTTP thread (#17740)
ngxson Dec 4, 2025
9d02299
server: strip content-length header on proxy (#17734)
ngxson Dec 4, 2025
bde188d
metal: TRI, FILL, EXPM1, SOFTPLUS (#16623)
gabe-l-hart Dec 4, 2025
96fe9ba
Add support for CUMSUM and TRI for CUDA. (#17584)
pwilkin Dec 4, 2025
3143a75
docs : update ops.md (Metal, BLAS) (#17768)
gabe-l-hart Dec 4, 2025
03d9a77
ci : transform release binary root dir in tar to llama-bXXXX (#17773)
CISC Dec 5, 2025
668ed76
HIP: enable WMMA-MMQ INT kernels for RDNA 3 (#17576)
jiachengjason Dec 5, 2025
e95d0bc
CUDA: fix FA VKQ accumulator overflow (#17746)
JohannesGaessler Dec 5, 2025
6648989
Add pwilkin to CODEOWNERS for chat files (#17789)
pwilkin Dec 5, 2025
3a0d105
Q4/Q8 Tiled Gemm Optimization. (#16999)
shalinib-ibm Dec 5, 2025
a6cfc21
ci : fix winget workflow (#17790)
angt Dec 5, 2025
1be9783
fix: prevent segfault in tokenizer on highly repetitive input (#17786)
ServeurpersoCom Dec 5, 2025
6016d0b
HIP : fix RDNA4 build (#17792)
JohannesGaessler Dec 5, 2025
c41bde6
metal : add residency sets keep-alive heartbeat (#17766)
ggerganov Dec 5, 2025
8160b38
rpc : fix alloc size logic (#17116)
ggerganov Dec 5, 2025
93bb926
vulkan: set all memory allocations to high priority (#17624)
jeffbolznv Dec 5, 2025
6ab0d64
vulkan: enable mmvq for q2_k on NVIDIA (#17675)
jeffbolznv Dec 5, 2025
fd57b24
ggml webgpu: unary op support, code refactoring, ops support (#17764)
reeselevine Dec 5, 2025
e15cd06
vulkan : support conv-2d with large output size (#17685)
Acly Dec 5, 2025
a0f3897
vulkan: fix top_k bug when there are ties in the input (#17659)
jeffbolznv Dec 5, 2025
933414c
vulkan: add more num_blocks instantiations in rms_norm (#17701)
jeffbolznv Dec 5, 2025
a341f3c
support gated linear attn
Oct 31, 2025
c69e73f
fix case for GGML_OP_GATED_LINEAR_ATTN
YushengZhao Dec 6, 2025
9 changes: 8 additions & 1 deletion .clang-format
@@ -22,7 +22,14 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
- BinPackArguments: false
+ # Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
+ AttributeMacros:
+   - __host__
+   - __device__
+   - __global__
+   - __forceinline__
+   - __launch_bounds__
+ BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
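The new AttributeMacros entries tell clang-format to treat the CUDA decorations as attributes rather than plain identifiers, so it no longer breaks lines inside them. A quick way to check the effect is a dry run over a CUDA source; this is a sketch assuming clang-format 12 or newer (where AttributeMacros support landed) and an illustrative file path:

    # report (but do not apply) formatting diffs; exits non-zero on violations
    clang-format --dry-run --Werror ggml/src/ggml-cuda/ggml-cuda.cu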
1 change: 1 addition & 0 deletions .clang-tidy
@@ -17,6 +17,7 @@ Checks: >
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
+ -performance-enum-size,
portability-*,
-portability-simd-intrinsics,
misc-*,
13 changes: 6 additions & 7 deletions .devops/cann.Dockerfile
@@ -3,17 +3,15 @@
# ==============================================================================

# Define the CANN base image for easier version updates later
- ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
+ ARG CHIP_TYPE=910b
+ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

- # Define the Ascend chip model for compilation. Default is Ascend910B3
- ARG ASCEND_SOC_TYPE=Ascend910B3
-
# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
yum clean all && \
@@ -36,20 +34,21 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# For brevity, only core variables are listed here. You can paste the original ENV list here.

# -- Build llama.cpp --
- # Use the passed ASCEND_SOC_TYPE argument and add general build options
+ # Use the passed CHIP_TYPE argument and add general build options
+ ARG CHIP_TYPE
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
&& \
cmake -B build \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
- -DSOC_TYPE=${ASCEND_SOC_TYPE} \
+ -DSOC_TYPE=ascend${CHIP_TYPE} \
. && \
cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
- find build -name "*.so" -exec cp {} /app/lib \;
+ find build -name "*.so*" -exec cp -P {} /app/lib \;

# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
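With the SoC selection now driven by a single CHIP_TYPE build argument (which also picks the matching CANN base-image tag), switching chips is a one-flag change. A hypothetical invocation; the -t tag is an assumption, not something this diff prescribes:

    # CHIP_TYPE defaults to 910b; 310p shown here as an alternative
    docker build -f .devops/cann.Dockerfile --build-arg CHIP_TYPE=310p -t llama-cpp-cann .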
2 changes: 1 addition & 1 deletion .devops/cpu.Dockerfile
@@ -20,7 +20,7 @@ RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
- find build -name "*.so" -exec cp {} /app/lib \;
+ find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
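The recurring find ... cp -P change (here and in the other Dockerfiles) does two things: the widened "*.so*" glob also matches versioned names like libfoo.so.1, and -P copies symlinks as symlinks instead of dereferencing them into duplicate full copies, which is what the "release: fix duplicate libs, store symbolic links" commit above addresses. A minimal sketch with a hypothetical library layout:

    # assume the usual chain: libllama.so -> libllama.so.1 -> libllama.so.1.0.0
    cp build/bin/libllama.so* /app/lib      # follows links: three full copies of one library
    cp -P build/bin/libllama.so* /app/lib   # keeps links: one real file plus two tiny symlinks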
2 changes: 1 addition & 1 deletion .devops/cuda.Dockerfile
@@ -25,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
- find build -name "*.so" -exec cp {} /app/lib \;
+ find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
8 changes: 4 additions & 4 deletions .devops/intel.Dockerfile
@@ -1,8 +1,8 @@
- ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
+ ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04

## Build Image

- FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
- find build -name "*.so" -exec cp {} /app/lib \;
+ find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
@@ -31,7 +31,7 @@ RUN mkdir -p /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh

- FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
+ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

RUN apt-get update \
&& apt-get install -y libgomp1 curl\
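The SYCL image still honors the GGML_SYCL_F16 build argument on top of the new deep-learning-essentials base; a sketch of enabling it (the -t tag is an assumption):

    docker build -f .devops/intel.Dockerfile --build-arg GGML_SYCL_F16=ON -t llama-cpp-sycl .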
4 changes: 2 additions & 2 deletions .devops/musa.Dockerfile
@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
- ARG MUSA_VERSION=rc4.2.0
+ ARG MUSA_VERSION=rc4.3.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

@@ -32,7 +32,7 @@ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
- find build -name "*.so" -exec cp {} /app/lib \;
+ find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
6 changes: 2 additions & 4 deletions .devops/nix/package.nix
@@ -34,6 +34,7 @@
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false,
+ useRpc ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

# It's necessary to consistently use backendStdenv when building with CUDA support,
@@ -128,10 +129,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
};

postPatch = ''
- substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
- --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
- substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
- --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
'';

# With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
Expand Down Expand Up @@ -179,6 +176,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
(cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic)
(cmakeBool "GGML_RPC" useRpc)
]
++ optionals useCuda [
(
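The new useRpc flag defaults to false and simply toggles GGML_RPC at configure time. One hedged way to flip it outside the flake, assuming nixpkgs' callPackage can satisfy the package's inputs (this invocation is illustrative, not part of the diff):

    nix build --impure --expr \
      'with import <nixpkgs> { }; callPackage ./.devops/nix/package.nix { useRpc = true; }'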
29 changes: 15 additions & 14 deletions .devops/rocm.Dockerfile
@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
- ARG ROCM_VERSION=6.4
- ARG AMDGPU_VERSION=6.4
+ ARG ROCM_VERSION=7.0
+ ARG AMDGPU_VERSION=7.0

- # Target the CUDA build image
+ # Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

### Build image
@@ -13,18 +13,14 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
- # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
- # gfx906 is deprecated
- #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
+ # gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102,not officialy supported
+ # check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html

- ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
- #ARG ROCM_DOCKER_ARCH=gfx1100
+ ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
+ #ARG ROCM_DOCKER_ARCH='gfx1151'

- # Set nvcc architectured
+ # Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
- # Enable ROCm
- # ENV CC=/opt/rocm/llvm/bin/clang
- # ENV CXX=/opt/rocm/llvm/bin/clang++

RUN apt-get update \
&& apt-get install -y \
@@ -40,11 +36,16 @@ WORKDIR /app
COPY . .

RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
- cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
+ cmake -S . -B build \
+ -DGGML_HIP=ON \
+ -DGGML_HIP_ROCWMMA_FATTN=ON \
+ -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
+ -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
+ -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
&& cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib \
- && find build -name "*.so" -exec cp {} /app/lib \;
+ && find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
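Note that ROCM_DOCKER_ARCH switched from a comma-separated to a semicolon-separated list, i.e. a proper CMake list for AMDGPU_TARGETS, so overrides must be quoted to keep the shell from splitting on ';'. A hypothetical narrowed build (arch pair and tag assumed):

    # quotes protect the CMake-style ';' separators from the shell
    docker build -f .devops/rocm.Dockerfile \
      --build-arg ROCM_DOCKER_ARCH='gfx1100;gfx1151' -t llama-cpp-rocm .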
126 changes: 126 additions & 0 deletions .devops/s390x.Dockerfile
@@ -0,0 +1,126 @@
ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04

### Build Llama.cpp stage
FROM gcc:${GCC_VERSION} AS build

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt upgrade -y && \
apt install -y --no-install-recommends \
git cmake ccache ninja-build \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
libopenblas-dev libcurl4-openssl-dev && \
rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . .

RUN --mount=type=cache,target=/root/.ccache \
--mount=type=cache,target=/app/build \
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_NATIVE=OFF \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS && \
cmake --build build --config Release -j $(nproc) && \
cmake --install build --prefix /opt/llama.cpp

COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin

COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements


### Collect all llama.cpp binaries, libraries and distro libraries
FROM scratch AS collector

# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


### Base image
FROM ubuntu:${UBUNTU_VERSION} AS base

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt install -y --no-install-recommends \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
# See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
curl libgomp1 libopenblas-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


### Full
FROM base AS full

ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
apt update -y && \
apt install -y \
git cmake libjpeg-dev \
python3 python3-pip python3-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py

RUN pip install --no-cache-dir --break-system-packages \
-r /app/gguf-py/requirements.txt

ENTRYPOINT [ "/app/tools.sh" ]


### CLI Only
FROM base AS light

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


### Server
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

EXPOSE 8080

ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]