Changes from all commits
378 commits
5d5c066
mtmd : fix Pixtral OOM with large images by capping image_size to 102…
yuiseki Jun 22, 2025
af3373f
HIP: enable vec fattn on RDNA4 (#14323)
IMbackK Jun 22, 2025
f1f5e82
examples : fix is_first logic for tokenization (#14329)
ggerganov Jun 22, 2025
66aba7a
run : avoid double tokenization (#14327)
retr0reg Jun 22, 2025
238005c
gguf-py : fix SpecialVocab parsing when post_processor is null (#14330)
CISC Jun 22, 2025
fa4a9f2
quantize : handle user-defined pruning of whole layers (blocks) (#13037)
EAddario Jun 22, 2025
3a9457d
vulkan: update windows SDK in CI (#14334)
jeffbolznv Jun 23, 2025
7b50d58
kv-cells : fix tracking of seq_pos (#14339)
ggerganov Jun 23, 2025
defe215
CUDA: mul_mat_v support for batch sizes > 1 (#14262)
JohannesGaessler Jun 23, 2025
72c6bc3
llama : better rwkv chat template and add missing `inputs.use_jinja` …
MollySophia Jun 23, 2025
bf2a99e
vulkan: update windows SDK in release.yml (#14344)
jeffbolznv Jun 23, 2025
ce82bd0
ci: add workflow for relocatable cmake package (#14346)
bandoti Jun 23, 2025
0142961
CUDA/HIP: optimize mmv paths taken for HIP devices (#14324)
IMbackK Jun 23, 2025
901e20b
jinja : Add Mistral-Small-3.2-24B-Instruct-2506.jinja (#14349)
bartowski1182 Jun 24, 2025
abf2410
main : honor --verbose-prompt on interactive prompts (#14350)
CISC Jun 24, 2025
1b809ce
server : move no API key doc to /health (#14352)
pnb Jun 24, 2025
c148cf1
cmake : use LLAMA_BUILD_NUMBER when defining LLAMA_INSTALL_VERSION (#…
mbaudier Jun 24, 2025
62af464
batch : fix check for empty sequences in memory (#14364)
ggerganov Jun 24, 2025
73e53dc
opencl: ref count `ggml_backend_opencl_context` and refactor profilin…
lhez Jun 24, 2025
2bf9d53
sycl: GGML_SYCL_DISABLE_OPT on by default for all Intel Devices (#13973)
ShanoToni Jun 25, 2025
b193d53
ggml : do not output unprintable characters on GGUF load failure (#14…
CISC Jun 25, 2025
60ef23d
ggml-cpu: enable IBM NNPA Vector Intrinsics (#14317)
taronaeo Jun 25, 2025
716301d
musa: enable fp16 mma (all) and cublas on qy2 (#13842)
yeahdongcn Jun 26, 2025
bf5bcd0
docs: update s390x documentation + add faq (#14389)
taronaeo Jun 26, 2025
5783ae4
metal : batch rows copy in a single threadgroup (#14384)
ggerganov Jun 26, 2025
e8215db
metal : add special-case mat-vec mul for ne00 == 4 (#14385)
ggerganov Jun 26, 2025
b253462
llama : return mistral-v7-tekken as default template only (#14390)
CISC Jun 26, 2025
a01047b
cmake: regen vulkan shaders when shaders-gen sources change (#14398)
bandoti Jun 26, 2025
8846aac
model : gemma3n text-only (#14400)
ngxson Jun 26, 2025
f667f1e
convert : fix broken sentencepiece vocab (#14416)
CISC Jun 27, 2025
8d94219
ggml : add ggml_set_rows (#14274)
rgerganov Jun 27, 2025
4367806
recurrent : call balloc split_reset() in init_batch() (#14414)
ggerganov Jun 27, 2025
72babea
graph : make llm_graph_context destructor virtual (#14410)
ggerganov Jun 27, 2025
ceb1bf5
vulkan: Fix GGML_VULKAN_SHADER_DEBUG_INFO (#14427)
jeffbolznv Jun 28, 2025
6609507
ci : fix windows build and release (#14431)
CISC Jun 28, 2025
b25e927
fix async_mode bug (#14432)
bachelor-dou Jun 28, 2025
566c16f
model : add support for ERNIE 4.5 0.3B model (#14408)
ownia Jun 28, 2025
00d5282
vulkan: lock accesses of pinned_memory vector (#14333)
jeffbolznv Jun 28, 2025
63a7bb3
vulkan: handle noncontig in the final case of ggml_vk_get_cpy_pipelin…
jeffbolznv Jun 28, 2025
27208bf
CUDA: add bf16 and f32 support to cublas_mul_mat_batched (#14361)
am17an Jun 28, 2025
bd9c981
vulkan: Add fusion support for RMS_NORM+MUL (#14366)
jeffbolznv Jun 29, 2025
a0535ff
ggml : implement REGLU/GEGLU/SWIGLU ops (#14158)
CISC Jun 29, 2025
a5d1fb6
ggml : fix unmerged GGML_FPxx_TO_FPxx refactoring (#14443)
CISC Jun 29, 2025
f47c1d7
SYCL: disable faulty fp16 exp kernel (#14395)
qnixsynapse Jun 29, 2025
83790b0
server : fix appearance of the chats list context menu for Safari (#1…
rntk Jun 29, 2025
caf5681
server : support jinja extra template kwargs (Qwen3 enable_thinking f…
matteoserva Jun 29, 2025
e9b6350
scripts : make the shell scripts cross-platform (#14341)
vedranmiletic Jun 30, 2025
c839a2d
cmake : Remove redundant include path in CMakeLists.txt (#14452)
xiaobing318 Jun 30, 2025
eb3fa29
test-backend-ops : disable llama test (#14461)
slaren Jun 30, 2025
a7417f5
ggml-cpu: sycl: Re-enable exp f16 (#14462)
Rbiessy Jun 30, 2025
5dd942d
metal : disable fast-math for some cpy kernels (#14460)
ggerganov Jun 30, 2025
745f11f
memory : correctly handle failure in apply() (#14438)
ggerganov Jun 30, 2025
0a5a3b5
Add Conv2d for CPU (#14388)
am17an Jun 30, 2025
79b33b2
opencl : add GEGLU, REGLU, SWIGLU (#14456)
lhez Jul 1, 2025
497be7c
ggml-quants : rename best_mad to best_error (ggml/1283)
danbev Jun 24, 2025
431b2c2
ggml-cpu : "align corners" for bilinear upscale/downscale (ggml/1285)
Acly Jul 1, 2025
f61c05d
sync : ggml
ggerganov Jul 1, 2025
a6a4795
ggml : remove trailing whitespace (#0)
ggerganov Jul 1, 2025
eff5e45
add GELU_ERF (#14455)
CISC Jul 1, 2025
6a746cf
vulkan: Split large mul_mat_id to fit in shared memory (#14451)
jeffbolznv Jul 1, 2025
343b6e9
CANN: update aclnnGroupedMatmulV2 to aclnnGroupedMatmulV3 (#14411)
noemotiovon Jul 1, 2025
1b2aaf2
Add Vulkan images to docker.md (#14472)
xek Jul 1, 2025
de56944
ci : disable fast-math for Metal GHA CI (#14478)
ggerganov Jul 1, 2025
68b3cd6
ggml : Callback before abort (#14481)
ScaledLizard Jul 2, 2025
85841e1
github : add OpenCL backend to issue templates (#14492)
EZForever Jul 2, 2025
611ba4b
ci : add OpenCL to labeler workflow (#14496)
CISC Jul 2, 2025
603e43d
opencl : update upscale to support align corners (#14488)
lhez Jul 2, 2025
c8a4e47
opencl : skip empty nodes on cgraph compute (#14491)
EZForever Jul 2, 2025
d7f5f4e
simple-chat : fix context-exceeded condition (#14494)
ggerganov Jul 2, 2025
307e79d
opencl : fix possible buffer overflow in dump_tensor (#14490)
jeffzhou2000 Jul 2, 2025
ec68e84
ggml : support bcast ggml_soft_max_ext, ggml_flash_attn_ext (#14435)
ggerganov Jun 27, 2025
8875523
vulkan: support softmax/FA batch and broadcast (#14449)
jeffbolznv Jul 1, 2025
12a81af
CUDA: broadcasting for FlashAttention mask (#14500)
JohannesGaessler Jul 2, 2025
55a1c5a
CUDA: add softmax broadcast (#14475)
am17an Jul 2, 2025
f3ed38d
Set RPATH to "@loader_path" / "$ORIGIN" to ensure executables and dyn…
rotemdan Jul 2, 2025
c46944a
ggml : add version function to get lib version (ggml/1286)
danbev Jul 2, 2025
e17991c
sync : ggml
ggerganov Jul 2, 2025
5d46bab
llama : initial Mamba-2 support (#9126)
compilade Jul 2, 2025
e75ba4c
gguf-py : add support for chat template jinja files (#14508)
CISC Jul 2, 2025
55c2646
CUDA: add dynamic shared mem to softmax, refactor general usage (#14497)
am17an Jul 2, 2025
d4cdd9c
ggml : remove kompute backend (#14501)
ggerganov Jul 3, 2025
9067487
ggml : fix FA mask dim 2 and 3 (#14505)
ggerganov Jul 3, 2025
a70c8a0
kv-cache : use ggml_set_rows (#14285)
ggerganov Jul 3, 2025
0c2ee38
convert : correct gemma 3n conversion (#14450)
ngxson Jul 3, 2025
7b63a71
Fix conditional enabling following arch checks for ggml-sycl (#14504)
s-Nick Jul 3, 2025
c8c4495
ggml: backward pass for split swiglu (#14483)
JohannesGaessler Jul 3, 2025
2b72bed
vulkan: support mixed/deepseekR1 FA head sizes (#14509)
jeffbolznv Jul 3, 2025
bee2842
opencl : broadcast for soft_max (#14510)
lhez Jul 3, 2025
28657a8
ggml : implement GEGLU_ERF and GEGLU_QUICK ops (#14445)
CISC Jul 3, 2025
499a8f5
CANN: Replace aclrtMemsetSync with aclnnInplaceZero operator (#14002)
luyhcsu Jul 4, 2025
c79184d
batch : add n_used count (#14512)
ggerganov Jul 4, 2025
7b50f7c
graph : prepare for 4D mask (#14515)
ggerganov Jul 4, 2025
67d1ef2
batch : add optional for sequential equal split (#14511)
ggerganov Jul 4, 2025
ef797db
metal : disable fast math in all quantize kernels (#14528)
ggerganov Jul 4, 2025
b81510a
test-backend-ops: add support for specifying output format (#14368)
yeahdongcn Jul 5, 2025
bac8bed
eval-callback : check for empty input (#14539)
ggerganov Jul 5, 2025
6681688
opencl: add GELU_ERF (#14476)
CISC Jul 5, 2025
ddef995
server : fix assistant prefilling when content is an array (#14360)
CISC Jul 5, 2025
a0374a6
vulkan: Handle updated FA dim2/3 definition (#14518)
jeffbolznv Jul 5, 2025
0afa563
add geglu activation function (#14074)
huydt84 Jun 9, 2025
25febe5
sycl: Add reorder to Q6_K mmvq implementation (#13885)
s-Nick Jun 9, 2025
87db860
CANN: Simplify the environment variable setting(#13104)
bachelor-dou Jun 9, 2025
6c596e9
graph : fix geglu (#14077)
ggerganov Jun 9, 2025
6bd02a0
ggml-cpu : split arch-specific implementations (#13892)
xctan Jun 9, 2025
09b69d5
llama : allow building all tests on windows when not using shared lib…
slaren Jun 9, 2025
2a09e26
sync : ggml
ggerganov Jun 10, 2025
5b9d89a
Vulkan: Don't default to CPU device (like llvmpipe), even if no other…
0cc4m Jun 10, 2025
c11b827
ggml : fix weak alias win32 (whisper/0)
ggerganov Jun 10, 2025
cae5806
sync : ggml
ggerganov Jun 10, 2025
d0eea7a
llama : support GEGLU for jina-bert-v2 (#14090)
CISC Jun 10, 2025
97d02ff
convert : fix duplicate key DeepSeek-R1 conversion error (#14103)
CISC Jun 10, 2025
573694e
kv-cache : avoid modifying recurrent cells when setting inputs (#13834)
compilade Jun 10, 2025
51f8e6a
opencl: add `mul_mv_id_q4_0_f32_8x_flat` (#14003)
lhez Jun 10, 2025
cc88e69
vulkan: Track descriptor pools/sets per-context (#14109)
jeffbolznv Jun 11, 2025
0ca0399
kv-cache : add LLAMA_KV_CACHE_DEBUG environment variable (#14121)
ggerganov Jun 11, 2025
85c8f78
kv-cache : relax SWA masking condition (#14119)
ggerganov Jun 11, 2025
9106dfc
vulkan: Better thread-safety for command pools/buffers (#14116)
jeffbolznv Jun 11, 2025
170b187
tests : add test-tokenizers-repo (#14017)
CISC Jun 11, 2025
f8590f2
chore : clean up relative source dir paths (#14128)
CISC Jun 11, 2025
aed2b29
Implement GGML_CPU_ALL_VARIANTS for ARM (#14080)
ckastner Jun 11, 2025
16faf81
kv-cache : fix split_equal handling in unified implementation (#14130)
ggerganov Jun 12, 2025
3724d37
batch : remove logits_all flag (#14141)
ggerganov Jun 12, 2025
3fd6eb4
context : simplify output counting logic during decode (#14142)
ggerganov Jun 12, 2025
e57e5ee
cmake : Improve build-info.cpp generation (#14156)
ckastner Jun 13, 2025
40eefac
cmake: Add ability to pass in LLAMA_BUILD_NUMBER/COMMIT (#14167)
ckastner Jun 13, 2025
dfb86c5
batch : rework llama_batch_allocr (#14153)
ggerganov Jun 13, 2025
f8fea17
batch : add LLAMA_BATCH_DEBUG environment variable (#14172)
ggerganov Jun 13, 2025
4d3beb4
Merge commit from fork
GuyGoldenberg Jun 13, 2025
649cd66
vocab : fix build (#14175)
ggerganov Jun 13, 2025
4460d36
batch : auto-gen positions + verify multi-sequence input (#14177)
ggerganov Jun 15, 2025
d406c39
cparams : rename LLAMA_MAX_PARALLEL_SEQUENCES to LLAMA_MAX_SEQ (#14188)
ggerganov Jun 15, 2025
a820a9e
model : add dots.llm1 architecture support (#14044) (#14118)
Noeda Jun 15, 2025
512bd19
model : Add support for Arcee AI's upcoming AFM model (#14185)
bartowski1182 Jun 15, 2025
33844e4
ggml-cpu : rework weak alias on apple targets (#14146)
xctan Jun 16, 2025
b1b8f53
vulkan: mutex around vkQueueSubmit (#14127)
jeffbolznv Jun 16, 2025
47662a3
convert : remove arcee change in convert_hf_to_gguf_update.py (#14207)
bartowski1182 Jun 16, 2025
e9bf3df
ggml: Add Android support for GGML_CPU_ALL_VARIANTS (#14206)
chaxu01 Jun 16, 2025
4863fc5
llama : rework embeddings logic (#14208)
ggerganov Jun 16, 2025
3b7b7bb
model : add NeoBERT (#14164)
huydt84 Jun 16, 2025
9c991f9
cmake: clean up external project logic for vulkan-shaders-gen (#14179)
bandoti Jun 16, 2025
9a2e6cd
server : fix incorrect usage of llama_get_embeddings() (#14225)
ggerganov Jun 16, 2025
4143c93
ggml-cpu : remove the weak alias trick (#14221)
xctan Jun 17, 2025
d763b3e
cmake: remove shader-gen step-targets from ggml-vulkan (#14226)
bandoti Jun 17, 2025
b702324
examples : include examples in msvc disable warn (ggml/1270)
danbev Jun 12, 2025
26ec08a
ggml : disable warnings for tests when using MSVC (ggml/1273)
danbev Jun 13, 2025
60240a7
sync : ggml
ggerganov Jun 18, 2025
aa4c333
convert : fix null head_dim AutoConfig regression (#14248)
CISC Jun 18, 2025
4c3ae3a
ggml: Add Apple support for GGML_CPU_ALL_VARIANTS (#14258)
chaxu01 Jun 18, 2025
ae9ee21
docs: add s390x build documentation (#14264)
taronaeo Jun 18, 2025
bc06756
metal : add mean kernel (#14267)
ggerganov Jun 19, 2025
6bd7a3d
memory : Hybrid recurrent cache (#13979)
gabe-l-hart Jun 19, 2025
bd5017e
llamafile : support s390x SIMD instruction set (#14273)
taronaeo Jun 19, 2025
ff19f0c
ggml-cpu : remove unnecesary arm feature detection (#14281)
slaren Jun 19, 2025
b85c660
CUDA: add conv_2d_dw (#14265)
am17an Jun 20, 2025
26439ad
ubatch : new splitting logic (#14217)
ggerganov Jun 20, 2025
3683408
model : more uniform output id handling (#14275)
ggerganov Jun 20, 2025
d3c5e6d
ggml : fix repack work size for mul_mat_id (#14292)
ggerganov Jun 20, 2025
2b9c9f5
llama : improve sep token handling (#14272)
CISC Jun 20, 2025
55e53f0
Implement GGML_CPU_ALL_VARIANTS for PowerPC (#14286)
ckastner Jun 20, 2025
cd09e2d
sycl: add usage of enqueue_functions extension (#14244)
s-Nick Jun 20, 2025
6420b94
vocab : prevent tokenizer overflow (#14301)
retr0reg Jun 20, 2025
7eb4e7f
lint : remove trailing whitepace (#14304)
CISC Jun 20, 2025
67bdc9d
CUDA: add conv_2d_transpose (#14287)
am17an Jun 20, 2025
313900e
Add `ggml_roll` (ggml/1274)
Acly Jun 18, 2025
e5be49f
sync : ggml
ggerganov Jun 20, 2025
2d597b3
memory : rename interface to llama_memory_context_i (#14296)
ggerganov Jun 21, 2025
634cf89
gguf-py : fix TemplateProcessing pair when bos/eos is missing (#14312)
CISC Jun 21, 2025
afe7dd5
Add support for VK_EXT_debug_utils to add labels to Vulkan objects. (…
mtavenrath Jun 21, 2025
b6a1896
gguf-py : fix Qwen3-Embedding eos token (#14314)
CISC Jun 21, 2025
c00c967
CUDA: add mean operation (#14313)
am17an Jun 22, 2025
689ca95
HIP: enable vec fattn on RDNA4 (#14323)
IMbackK Jun 22, 2025
d88f779
examples : fix is_first logic for tokenization (#14329)
ggerganov Jun 22, 2025
4f75ca6
run : avoid double tokenization (#14327)
retr0reg Jun 22, 2025
1f044a1
gguf-py : fix SpecialVocab parsing when post_processor is null (#14330)
CISC Jun 22, 2025
65f31a0
kv-cells : fix tracking of seq_pos (#14339)
ggerganov Jun 23, 2025
f01962e
CUDA: mul_mat_v support for batch sizes > 1 (#14262)
JohannesGaessler Jun 23, 2025
8d719d0
CUDA/HIP: optimize mmv paths taken for HIP devices (#14324)
IMbackK Jun 23, 2025
e7e42a6
cmake : use LLAMA_BUILD_NUMBER when defining LLAMA_INSTALL_VERSION (#…
mbaudier Jun 24, 2025
f3bca29
batch : fix check for empty sequences in memory (#14364)
ggerganov Jun 24, 2025
3f52740
opencl: ref count `ggml_backend_opencl_context` and refactor profilin…
lhez Jun 24, 2025
7beb268
ggml-cpu: enable IBM NNPA Vector Intrinsics (#14317)
taronaeo Jun 25, 2025
1f6a41e
musa: enable fp16 mma (all) and cublas on qy2 (#13842)
yeahdongcn Jun 26, 2025
63bae38
docs: update s390x documentation + add faq (#14389)
taronaeo Jun 26, 2025
aea1eca
metal : batch rows copy in a single threadgroup (#14384)
ggerganov Jun 26, 2025
5deba9e
cmake: regen vulkan shaders when shaders-gen sources change (#14398)
bandoti Jun 26, 2025
bd44d6a
model : gemma3n text-only (#14400)
ngxson Jun 26, 2025
b3c74fe
convert : fix broken sentencepiece vocab (#14416)
CISC Jun 27, 2025
d518bdb
ggml : add ggml_set_rows (#14274)
rgerganov Jun 27, 2025
30f100d
recurrent : call balloc split_reset() in init_batch() (#14414)
ggerganov Jun 27, 2025
1fcf508
graph : make llm_graph_context destructor virtual (#14410)
ggerganov Jun 27, 2025
1a29b38
ci : fix windows build and release (#14431)
CISC Jun 28, 2025
241656e
fix async_mode bug (#14432)
bachelor-dou Jun 28, 2025
2ae4c3b
model : add support for ERNIE 4.5 0.3B model (#14408)
ownia Jun 28, 2025
10d2f66
vulkan: lock accesses of pinned_memory vector (#14333)
jeffbolznv Jun 28, 2025
a9478f0
CUDA: add bf16 and f32 support to cublas_mul_mat_batched (#14361)
am17an Jun 28, 2025
a15aa48
vulkan: Add fusion support for RMS_NORM+MUL (#14366)
jeffbolznv Jun 29, 2025
19505b5
ggml : implement REGLU/GEGLU/SWIGLU ops (#14158)
CISC Jun 29, 2025
8f73d07
ggml : fix unmerged GGML_FPxx_TO_FPxx refactoring (#14443)
CISC Jun 29, 2025
2228e0a
SYCL: disable faulty fp16 exp kernel (#14395)
qnixsynapse Jun 29, 2025
57bc4e8
scripts : make the shell scripts cross-platform (#14341)
vedranmiletic Jun 30, 2025
2f1765d
cmake : Remove redundant include path in CMakeLists.txt (#14452)
xiaobing318 Jun 30, 2025
69af9aa
test-backend-ops : disable llama test (#14461)
slaren Jun 30, 2025
fa22d2c
ggml-cpu: sycl: Re-enable exp f16 (#14462)
Rbiessy Jun 30, 2025
7240127
metal : disable fast-math for some cpy kernels (#14460)
ggerganov Jun 30, 2025
33134bd
memory : correctly handle failure in apply() (#14438)
ggerganov Jun 30, 2025
9f14ca8
Add Conv2d for CPU (#14388)
am17an Jun 30, 2025
bdb1f3d
opencl : add GEGLU, REGLU, SWIGLU (#14456)
lhez Jul 1, 2025
a0252fa
ggml-cpu : "align corners" for bilinear upscale/downscale (ggml/1285)
Acly Jul 1, 2025
606c87e
sync : ggml
ggerganov Jul 1, 2025
3c0d890
ggml : remove trailing whitespace (#0)
ggerganov Jul 1, 2025
e362ffa
vulkan: Split large mul_mat_id to fit in shared memory (#14451)
jeffbolznv Jul 1, 2025
73b416c
github : add OpenCL backend to issue templates (#14492)
EZForever Jul 2, 2025
d405569
ci : add OpenCL to labeler workflow (#14496)
CISC Jul 2, 2025
45405f5
opencl : update upscale to support align corners (#14488)
lhez Jul 2, 2025
bc1bc42
opencl : skip empty nodes on cgraph compute (#14491)
EZForever Jul 2, 2025
b65a3dd
simple-chat : fix context-exceeded condition (#14494)
ggerganov Jul 2, 2025
c5fbb52
opencl : fix possible buffer overflow in dump_tensor (#14490)
jeffzhou2000 Jul 2, 2025
6b89b76
ggml : support bcast ggml_soft_max_ext, ggml_flash_attn_ext (#14435)
ggerganov Jun 27, 2025
f69788b
vulkan: support softmax/FA batch and broadcast (#14449)
jeffbolznv Jul 1, 2025
29269ed
CUDA: broadcasting for FlashAttention mask (#14500)
JohannesGaessler Jul 2, 2025
438b7fa
CUDA: add softmax broadcast (#14475)
am17an Jul 2, 2025
cd1222b
Set RPATH to "@loader_path" / "$ORIGIN" to ensure executables and dyn…
rotemdan Jul 2, 2025
a726483
ggml : add version function to get lib version (ggml/1286)
danbev Jul 2, 2025
c8aba6b
sync : ggml
ggerganov Jul 2, 2025
c07421d
llama : initial Mamba-2 support (#9126)
compilade Jul 2, 2025
067afe2
gguf-py : add support for chat template jinja files (#14508)
CISC Jul 2, 2025
e7092f1
CUDA: add dynamic shared mem to softmax, refactor general usage (#14497)
am17an Jul 2, 2025
03d390a
ggml : remove kompute backend (#14501)
ggerganov Jul 3, 2025
f336e6e
ggml : fix FA mask dim 2 and 3 (#14505)
ggerganov Jul 3, 2025
c9b4442
kv-cache : use ggml_set_rows (#14285)
ggerganov Jul 3, 2025
ede5664
convert : correct gemma 3n conversion (#14450)
ngxson Jul 3, 2025
95614c2
Fix conditional enabling following arch checks for ggml-sycl (#14504)
s-Nick Jul 3, 2025
8e5beab
ggml: backward pass for split swiglu (#14483)
JohannesGaessler Jul 3, 2025
3c4bb4b
vulkan: support mixed/deepseekR1 FA head sizes (#14509)
jeffbolznv Jul 3, 2025
e75eb3d
opencl : broadcast for soft_max (#14510)
lhez Jul 3, 2025
5dca438
ggml : implement GEGLU_ERF and GEGLU_QUICK ops (#14445)
CISC Jul 3, 2025
b0fa27f
CANN: Replace aclrtMemsetSync with aclnnInplaceZero operator (#14002)
luyhcsu Jul 4, 2025
732d0ed
batch : add n_used count (#14512)
ggerganov Jul 4, 2025
567b16c
graph : prepare for 4D mask (#14515)
ggerganov Jul 4, 2025
13df0aa
batch : add optional for sequential equal split (#14511)
ggerganov Jul 4, 2025
906f2ab
metal : disable fast math in all quantize kernels (#14528)
ggerganov Jul 4, 2025
33a0f5f
test-backend-ops: add support for specifying output format (#14368)
yeahdongcn Jul 5, 2025
706a60f
eval-callback : check for empty input (#14539)
ggerganov Jul 5, 2025
0aafe1b
opencl: add GELU_ERF (#14476)
CISC Jul 5, 2025
344c1ce
server : fix assistant prefilling when content is an array (#14360)
CISC Jul 5, 2025
2f01628
vulkan: Handle updated FA dim2/3 definition (#14518)
jeffbolznv Jul 5, 2025
3db975e
Refactor and optimize various components in the ggml library
Minh141120 Jul 5, 2025
3bde2ed
ci: add ARM64 support for Windows in build matrix
Minh141120 Jul 5, 2025
f45f4f5
ci: add pull request trigger for dev branch in workflow
Minh141120 Jul 5, 2025
6b97e7f
ci: update pull request trigger paths for dev branch
Minh141120 Jul 5, 2025
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -40,7 +40,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
11 changes: 5 additions & 6 deletions .github/labeler.yml
@@ -1,10 +1,4 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute/**
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
@@ -93,3 +87,8 @@ Ascend NPU:
- ggml/include/ggml-cann.h
- ggml/src/ggml-cann/**
- docs/backend/CANN.md
OpenCL:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-opencl.h
- ggml/src/ggml-opencl/**
11 changes: 1 addition & 10 deletions .github/workflows/build.yml
@@ -740,9 +740,6 @@ jobs:
- build: 'llvm-arm64-opencl-adreno'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
# - build: 'kompute-x64'
# arch: 'x64'
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'

steps:
- name: Clone
@@ -756,12 +753,6 @@
variant: ccache
evict-old-files: 1d

- name: Clone Kompute submodule
id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }}
run: |
git submodule update --init ggml/src/ggml-kompute/kompute
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -777,7 +768,7 @@
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
if: ${{ matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
433 changes: 223 additions & 210 deletions .github/workflows/menlo-build.yml

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions .github/workflows/release.yml
@@ -49,7 +49,8 @@ jobs:
run: |
sysctl -a
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
@@ -103,7 +104,8 @@ jobs:
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
@@ -160,6 +162,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -211,6 +215,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
3 changes: 0 additions & 3 deletions .gitmodules
@@ -1,3 +0,0 @@
[submodule "kompute"]
path = ggml/src/ggml-kompute/kompute
url = https://github.com/nomic-ai/kompute.git
1 change: 0 additions & 1 deletion CMakeLists.txt
@@ -120,7 +120,6 @@ endfunction()

llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
115 changes: 111 additions & 4 deletions convert_hf_to_gguf.py
@@ -4408,9 +4408,6 @@ def __init__(self, *args, **kwargs):
]

def set_vocab(self):
with open(self.dir_model / "chat_template.jinja") as f:
# quick hack to make sure chat template is added
self.gguf_writer.add_chat_template(f.read())
super().set_vocab()

def set_gguf_parameters(self):
@@ -4781,6 +4778,14 @@ def set_gguf_parameters(self):
class MambaModel(TextModel):
model_arch = gguf.MODEL_ARCH.MAMBA

def __init__(self, dir_model: Path, *args, **kwargs):
# Avoid using AutoConfig for hparams
hparams = kwargs.pop("hparams", None)
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
super().__init__(dir_model, *args, hparams=hparams, **kwargs)

def set_vocab(self):
vocab_size = self.hparams["vocab_size"]
# Round vocab size to next multiple of 8
@@ -4855,6 +4860,100 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return [(new_name, data_torch)]


@ModelBase.register("Mamba2ForCausalLM")
class Mamba2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MAMBA2

def __init__(self, dir_model: Path, *args, **kwargs):
# Avoid using AutoConfig for hparams
# It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
hparams = kwargs.pop("hparams", None)
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
super().__init__(dir_model, *args, hparams=hparams, **kwargs)

def set_vocab(self):
vocab_size = self.hparams["vocab_size"]
# Round vocab size to next multiple of 16
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
# pad using ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
self.hparams["vocab_size"] = vocab_size

if (self.dir_model / "tokenizer.model").is_file():
self._set_vocab_sentencepiece()
elif (self.dir_model / "tokenizer.model.v3").is_file():
# mamba-codestral
raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
elif (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
# Use the GPT-NeoX tokenizer when no tokenizer files are present
self._set_vocab_builtin("gpt-neox", vocab_size)

def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
head_dim = self.find_hparam(["head_dim"], optional=True) or 64
n_group = self.find_hparam(["n_groups"], optional=True) or 1

rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

# Fail early for models which don't have a block expansion factor of 2
# TODO: does this really matter?
assert d_inner == 2 * d_model
assert d_inner % head_dim == 0

self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(d_inner // head_dim)
self.gguf_writer.add_ssm_group_count(n_group)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_file_type(self.ftype)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

if name.startswith("model.backbone") or name.startswith("model.lm_head"):
# map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
name = name.removeprefix("model.")

if name.endswith(".dt_bias"):
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"

new_name = self.map_tensor_name(name)

if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
data_torch = data_torch.squeeze()
elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
gguf.MODEL_TENSOR.SSM_A,
gguf.MODEL_TENSOR.SSM_D,
]):
# unsqueeze A to use similar shape semantics as Mamba-1
# (D is also unsqueezed, but for more straightforward broadcast internally)
data_torch = data_torch.reshape((*data_torch.shape, 1))
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
n_group = self.hparams.get("n_groups", 1)
data_torch = data_torch.reshape((n_group, d_inner // n_group))

if name.endswith(".A_log"):
logger.debug("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch)

yield (new_name, data_torch)


@ModelBase.register("CohereForCausalLM")
class CommandR2Model(TextModel):
model_arch = gguf.MODEL_ARCH.COMMAND_R
@@ -6615,12 +6714,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# maybe we should fallback to text model's arch in that case, since not many models have both
text_config = hparams.get("text_config", {})
vision_config = hparams.get("vision_config", {})
arch = hparams["architectures"][0]
arch = None
if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
arch = arches[0]
elif "ssm_cfg" in hparams:
# For non-hf Mamba and Mamba2 models
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"

# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
arch = vision_config["architectures"][0]
if arch is None:
raise ValueError("Failed to detect model architecture")
return arch


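The Mamba-2 converter above rounds the vocabulary size up to the next multiple of pad_vocab_size_multiple using the negative-floor-division trick referenced in the diff. A minimal Python sketch of that arithmetic, with illustrative sizes only:

def pad_vocab_size(vocab_size: int, pad: int = 16) -> int:
    # ceiling division via negative floor division, then scale back up to a multiple of pad
    # ref: https://stackoverflow.com/a/17511341/22827863
    return -(vocab_size // -pad) * pad

assert pad_vocab_size(50277) == 50288  # rounded up to the next multiple of 16
assert pad_vocab_size(32000) == 32000  # already aligned, left unchanged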
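Similarly, the get_model_architecture change above lets the converter fall back to ssm_cfg when config.json carries no architectures list, as with original (non-HF) Mamba and Mamba-2 checkpoints. A simplified, text-model-only sketch of that resolution order, using made-up config dictionaries:

def detect_arch(hparams: dict) -> str:
    # same fallback order as the patched get_model_architecture, text models only
    arch = None
    if (arches := hparams.get("architectures")):
        arch = arches[0]
    elif "ssm_cfg" in hparams:
        # non-HF Mamba/Mamba2 configs name the layer type instead of an architecture
        arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
    if (text_arches := hparams.get("text_config", {}).get("architectures")):
        arch = text_arches[0]
    if arch is None:
        raise ValueError("Failed to detect model architecture")
    return arch

print(detect_arch({"architectures": ["LlamaForCausalLM"]}))  # LlamaForCausalLM
print(detect_arch({"ssm_cfg": {"layer": "Mamba2"}}))         # Mamba2ForCausalLM
print(detect_arch({"ssm_cfg": {}}))                          # MambaForCausalLM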
5 changes: 5 additions & 0 deletions examples/eval-callback/eval-callback.cpp
@@ -136,6 +136,11 @@ static bool run(llama_context * ctx, const common_params & params) {

std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

if (tokens.empty()) {
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
return false;
}

if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
7 changes: 4 additions & 3 deletions examples/simple-chat/simple-chat.cpp
@@ -113,15 +113,16 @@ int main(int argc, char ** argv) {
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
exit(0);
}

if (llama_decode(ctx, batch)) {
GGML_ABORT("failed to decode\n");
int ret = llama_decode(ctx, batch);
if (ret != 0) {
GGML_ABORT("failed to decode, ret = %d\n", ret);
}

// sample the next token
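The simple-chat fix above accounts for sequence positions being zero-based: llama_memory_seq_pos_max returns the highest stored position, so the number of context cells already in use is that value plus one. A small sketch of the corrected overflow check, with made-up numbers rather than the real API:

def context_exceeded(seq_pos_max: int, n_batch: int, n_ctx: int) -> bool:
    # seq_pos_max == 9 means positions 0..9 are occupied, i.e. 10 tokens stored
    n_ctx_used = seq_pos_max + 1
    return n_ctx_used + n_batch > n_ctx

assert context_exceeded(seq_pos_max=9, n_batch=3, n_ctx=12) is True   # 10 + 3 > 12
assert context_exceeded(seq_pos_max=8, n_batch=3, n_ctx=12) is False  # 9 + 3 <= 12

Without the +1, the old check would admit one more token than the context can actually hold.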
9 changes: 7 additions & 2 deletions ggml/CMakeLists.txt
@@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
include/ggml-cann.h
include/ggml-cpp.h
include/ggml-cuda.h
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
@@ -360,6 +358,13 @@ write_basic_package_version_file(
VERSION ${GGML_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)

target_compile_definitions(ggml-base PRIVATE
GGML_VERSION="${GGML_INSTALL_VERSION}"
GGML_COMMIT="${GGML_BUILD_COMMIT}"
)
message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
50 changes: 0 additions & 50 deletions ggml/include/ggml-kompute.h

This file was deleted.
