Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
1621 commits
Select commit Hold shift + click to select a range
dd62dcf
convert : Make mistral-common dependency optional (#16738)
juliendenize Oct 23, 2025
0bf47a1
server: add memory breakdown print (#16740)
JohannesGaessler Oct 23, 2025
f8f071f
convert : handle pre-quantized models (#14810)
compilade Oct 23, 2025
5a91109
model-conversion : add trust_remote_code for orig model run [no ci] (…
danbev Oct 24, 2025
69e9ff0
webui: support q URL parameter (#16728)
odrling Oct 24, 2025
0bcb40b
CUDA: use CUB for arbitary size argsort (#16754)
am17an Oct 24, 2025
55945d2
ggml: fix CUDA grid launch condition for large block_nums.y in binbca…
leejet Oct 24, 2025
5cca254
convert : avoid dequantizing mxfp4 for GPT-OSS (#16756)
compilade Oct 25, 2025
8423d01
vulkan: Optimize SSM_SCAN (#16645)
jeffbolznv Oct 25, 2025
f90b4a8
vulkan: delete dead code (#16732)
giuseppe Oct 25, 2025
226f295
model : set res->t_embd in PLaMo2 models (#16766)
mitmul Oct 25, 2025
5d195f1
convert : handle mmproj filename/path properly (#16760)
Galunid Oct 25, 2025
3cfa9c3
vulkan: deduplicate Microsoft Direct3D12 devices (#16689)
giladgd Oct 26, 2025
f77c13b
CUDA: General GEMV fusion (#16715)
am17an Oct 26, 2025
8d88628
docs : add Jamba to Text-only models list (#16778)
amirai21 Oct 26, 2025
7cce4f8
model : set res->t_embd in SmallThinker models (#16782)
CISC Oct 26, 2025
f696428
graph : add clamping to ffn_moe_weights_sum to avoid div-by-zero (#16…
CISC Oct 26, 2025
73a48c9
convert : enable expert group selection for all models with it (#16691)
CISC Oct 26, 2025
bbac6a2
ggml: fix cuda kernel launch configuration for k_compute_batched_ptrs…
leejet Oct 26, 2025
bd562fe
cuda : use fast copy when src and dst are of different type and conti…
CISC Oct 26, 2025
3470a5c
ggml-alloc : make gallocr prefer chunks that allow memory reuse (#16788)
Acly Oct 26, 2025
75d33b9
CUDA: support for weight clamp in top-k norm (#16702)
am17an Oct 27, 2025
59fc1ec
sycl: add REPEAT_BACK operation support (#16734)
shani-f Oct 27, 2025
2b9bd9b
sycl: add ROLL operation support (#16665)
tamarPal Oct 27, 2025
75cbdd3
test-backend-ops: print failed tests at the end (#16785)
am17an Oct 27, 2025
945501f
llama: fix leaked buffers for mmap + split files (#16765)
JohannesGaessler Oct 27, 2025
c55d53a
model : add LightOnOCR-1B model (#16764)
ngxson Oct 27, 2025
80d28f1
HIP: fix AMDGPU_TARGETS, update documentation (#16803)
JohannesGaessler Oct 27, 2025
10640e3
ggml : fix interpolate with align-corners and ne=1 (#16700)
Acly Oct 27, 2025
5a4ff43
llama : disable pipeline parallelism if compute buffer allocation fai…
slaren Oct 27, 2025
e1ab084
mtmd : fix idefics3 preprocessing (#16806)
ngxson Oct 27, 2025
c053e18
chat: Add LFM2 tool handling (#16763)
ykhrustalev Oct 27, 2025
ad8d36b
sycl: add SSM_CONV operation support (#16800)
tamarPal Oct 28, 2025
463bbf2
CUDA: add unused vars to mmvf and mmvq (#16807)
am17an Oct 28, 2025
3479efd
CANN: Improve device ID handling and aclnnArange checks (#16752)
noemotiovon Oct 28, 2025
280d97b
grammar : support array references in json schema (#16792)
aldehir Oct 28, 2025
7a0e900
llama: consistent ctx <-> buf order for KV cache (#16746)
JohannesGaessler Oct 28, 2025
1c1409e
embedding: add raw option for --embd-output-format (#16541)
SamMalayek Oct 28, 2025
8284efc
initialise buffer.device in ggml_hexagon_session (#16816)
l3utterfly Oct 28, 2025
a8ca18b
llama-bench : clarify benchmarked parts of the computation (#16823)
ggerganov Oct 28, 2025
85a7d86
memory : remove KV cache size padding (#16812)
ggerganov Oct 28, 2025
851553e
cuda: add SET operation support (#16804)
YaelGitAccount Oct 28, 2025
338074c
sycl: add RMS_NORM_BACK operation support (#16808)
YaelLogic Oct 29, 2025
9a3ea68
CUDA: Fix bug in topk-moe for gpt-oss (#16821)
am17an Oct 29, 2025
f549b00
vulkan: Call ggml_vk_buffer_write_2d from ggml_vk_buffer_copy (#16793)
jeffbolznv Oct 29, 2025
144a4ce
vendor : sync minja (#16500)
CISC Oct 29, 2025
e41bcce
CUDA: use fastdiv in set-rows (#16834)
am17an Oct 29, 2025
3eb2be1
Hexagon Op queue & dispatch optimizations (#16820)
max-krasnyansky Oct 29, 2025
bcf5bda
Vulkan MMQ Integer Dot Refactor and K-Quant support (#16536)
0cc4m Oct 29, 2025
10fcc41
vulkan: Update topk_moe fusion to handle gpt's late softmax (#16656)
jeffbolznv Oct 29, 2025
e3af556
llama: store mrope data in KV cell (#16825)
ngxson Oct 29, 2025
3464bda
llama: fix ASAN error with M-RoPE (#16848)
ngxson Oct 29, 2025
b9ce940
vulkan: Fuse rope+set_rows (#16769)
jeffbolznv Oct 29, 2025
8b11dee
Hide latency of bias and gate-loading (#16847)
ORippler Oct 30, 2025
052df28
vulkan: Handle argsort with a large number of rows (#16851)
jeffbolznv Oct 30, 2025
d739511
llama : use std::abs instead of abs (#16853)
kaetemi Oct 30, 2025
229bf68
cuda : fix argsort with 64k+ rows (#16849)
CISC Oct 30, 2025
bacddc0
model: Add support for CogVLM model (#15002)
Tianyue-Zhao Oct 30, 2025
dcca0d3
cpu: introduce chunking for flash attention (#16829)
max-krasnyansky Oct 30, 2025
d261223
model: add support for qwen3vl series (#16780)
JJJYmmm Oct 30, 2025
835e918
common: fix typo in cli help text (#16864)
sbera77 Oct 30, 2025
517b717
cpu: introduce chunking for repack matmuls and enable matmul-id chunk…
max-krasnyansky Oct 30, 2025
b52edd2
server : remove n_past (#16818)
ggerganov Oct 30, 2025
16724b5
server : bump request URI max length to 32768 (#16862)
chansikpark Oct 30, 2025
ce18efe
convert : update transformers requirements (#16866)
RodriMora Oct 30, 2025
9984cbb
opencl: fix boundary handling for mul_mm (#16875)
lhez Oct 30, 2025
6eb208d
ci : enable free-disk-space on cuda docker build (#16877)
CISC Oct 30, 2025
13002a0
ggml-hexagon: respect input size when getting/setting tensor data (#1…
l3utterfly Oct 31, 2025
d2a2673
vulkan: fix shmem overrun in mmq id shader (#16873)
0cc4m Oct 31, 2025
2976b03
vulkan: Fix crash when FP16 mul_mat accumulation is not supported (#1…
rillomas Oct 31, 2025
d2d931f
vulkan: disable spirv-opt for rope shaders (#16872)
jeffbolznv Oct 31, 2025
0f715b4
server : fix typos in server.cpp comments [no ci] (#16883)
danbev Oct 31, 2025
c22473b
server : don't print user inputs to console (#16871)
ggerganov Oct 31, 2025
8da3c0e
batch : fix consistency checks for the input positions (#16890)
ggerganov Oct 31, 2025
4146d6a
CUDA: add expert reduce kernel (#16857)
am17an Oct 31, 2025
6d39015
sync : ggml
ggerganov Oct 31, 2025
31c511a
CUDA: Volta tensor core support for MMF (#16843)
JohannesGaessler Oct 31, 2025
e58d585
model : add Granite Hybrid nano types (#16896)
giuseppe Oct 31, 2025
0de0a01
model : Minimax M2 (#16831)
pwilkin Oct 31, 2025
bea0452
refactor : llama-model.cpp (#16252)
pwilkin Oct 31, 2025
d3dc9dd
CUDA: Remove unneded bias/gate dims in fused mmvq (#16858)
ORippler Nov 1, 2025
2e76e01
vulkan: fuse mul_mat+add and mul_mat_id+add_id (#16868)
jeffbolznv Nov 1, 2025
5d8bb90
vulkan: Fix multi_add invalid descriptor usage (#16899)
jeffbolznv Nov 1, 2025
74fef41
codeowners : update after refactor (#16905)
CISC Nov 1, 2025
961660b
common : allow --system-prompt-file for diffusion-cli (#16903)
CISC Nov 1, 2025
1ae7488
webui: recognize AsciiDoc files as valid text files (#16850)
jhradilek Nov 1, 2025
d8b860a
Add a setting to display message generation statistics (#16901)
allozaur Nov 1, 2025
cf659bb
mtmd: refactor preprocessing + support max/min pixels (#16878)
ngxson Nov 1, 2025
dd5e8ca
vendor : update cpp-httplib to 0.27.0 (#16846)
angt Nov 1, 2025
e4a7159
webui: add HTML/JS preview support to MarkdownContent with sandboxed …
ServeurpersoCom Nov 1, 2025
2f68ce7
webui: auto-refresh /props on inference start to resync model metadat…
ServeurpersoCom Nov 1, 2025
7fd205a
scripts : add script to bench models (#16894)
ggerganov Nov 1, 2025
d38d9f0
ggml: add s390x cpu-feats (#16774)
taronaeo Nov 2, 2025
a864132
devops: fix failing s390x docker build (#16918)
taronaeo Nov 2, 2025
7db35a7
CUDA: add FLOOR, CEIL, ROUND, TRUNC unary ops (#16917)
mnehete32 Nov 2, 2025
76af40a
docs: remove llama_sampler_accept reference in sampling sample usage …
alundb Nov 2, 2025
87c9efc
common : move gpt-oss reasoning processing to init params (#16937)
aldehir Nov 2, 2025
cd5e3b5
server : support unified cache across slots (#16736)
ggerganov Nov 2, 2025
2f966b8
clip : use FA (#16837)
ggerganov Nov 2, 2025
6b9a524
model: add Janus Pro for image understanding (#16906)
ravenouse Nov 2, 2025
dd52868
ci : disable failing riscv cross build (#16952)
CISC Nov 2, 2025
a2054e3
test-backend-ops : fix segfault in moe-expert-reduce test in support …
sbera77 Nov 2, 2025
bcfa876
feat(webui): improve LaTeX rendering with currency detection (#16508)
srogmann Nov 2, 2025
7e99416
SYCL: optimized repeat_back kernel (3× fewer asm instructions, 2× fas…
shani-f Nov 3, 2025
ee3a5a1
sync: minja (glm 4.6 & minmax m2 templates) (#16949)
ochafik Nov 3, 2025
fcfce04
ggml : LoongArch fixes (#16958)
MQ-mengqing Nov 3, 2025
bf7b0c9
mtmd: pad mask for qwen2.5vl (#16954)
ngxson Nov 3, 2025
070ff4d
mtmd: add --image-min/max-tokens (#16921)
ngxson Nov 3, 2025
622cd01
ggml: CUDA: add head size 72 for flash-attn (#16962)
theo77186 Nov 3, 2025
48bd265
server : add props.model_alias (#16943)
ggerganov Nov 3, 2025
ed8aa63
model-conversion : pass config to from_pretrained (#16963)
danbev Nov 3, 2025
e7da30b
fix: Viewing multiple PDF attachments (#16974)
allozaur Nov 3, 2025
c5023da
opencl: support imrope (#16914)
lhez Nov 3, 2025
2759ccd
CUDA: avoid mul + bias fusion when doing fusion (#16935)
am17an Nov 4, 2025
1f5accb
Fix garbled output with REPACK at high thread counts (#16956)
NoahOksuz Nov 4, 2025
b164259
chore : fix models indent after refactor (#16992)
CISC Nov 4, 2025
d945834
ci : apply model label to models (#16994)
CISC Nov 4, 2025
cc98f8d
ggml-cpu : bicubic interpolation (#16891)
Acly Nov 4, 2025
afd3532
readme : update hot topics (#17002)
ggerganov Nov 4, 2025
66d8ecc
server : do context shift only while generating (#17000)
ggerganov Nov 4, 2025
ad51c0a
vulkan: remove the need for the dryrun (#16826)
jeffbolznv Nov 4, 2025
a5c07dc
refactor: replace sprintf with snprintf for safer string handling in …
chraac Nov 4, 2025
5e90233
opencl: update doc (#17011)
lhez Nov 5, 2025
9aa6337
CUDA: update ops.md (#17005)
mnehete32 Nov 5, 2025
852ce51
ggml : fix conv2d_dw SVE path (ggml/1380)
ggerganov Nov 4, 2025
cdabeb2
sync : ggml
ggerganov Nov 4, 2025
03ea041
ggml webgpu: minor set rows optimization (#16810)
reeselevine Nov 5, 2025
9f05247
model : add openPangu-Embedded (#16941)
Lpzhan931 Nov 5, 2025
fd2f84f
docs: Clarify the endpoint that webui uses (#17001)
openingnow Nov 5, 2025
2f0c2db
mtmd: improve struct initialization (#16981)
ngxson Nov 5, 2025
13b339b
server : do not default to multiple slots with speculative decoding (…
ggerganov Nov 5, 2025
92bb84f
mtmd: allow QwenVL to process larger image by default (#17020)
ngxson Nov 5, 2025
5886f4f
examples(gguf): GGUF example outputs (#17025)
gabe-l-hart Nov 5, 2025
a44d771
vulkan: Fix GGML_VULKAN_CHECK_RESULTS to better handle fusion (#16919)
jeffbolznv Nov 5, 2025
230d116
improve CUDA cpy memory bandwidth when copying transposed tensor (#1…
bssrdf Nov 5, 2025
6db3d1f
ggml-hexagon: graceful fallback for older socs where rpcmem_alloc2 an…
l3utterfly Nov 6, 2025
22c8c3c
docs: explain CUDA 11 compilation [no ci] (#16824)
JohannesGaessler Nov 6, 2025
9d7c518
sycl: add CONCAT operator support (#16047)
ye-NX Nov 6, 2025
4882f0f
clip: implement minicpm-v sinusoidal embd using GGML (#17036)
ngxson Nov 6, 2025
b7f9010
server : disable checkpoints with mtmd (#17045)
ggerganov Nov 6, 2025
5b180c3
metal : initial Metal4 tensor API support (#16634)
ggerganov Nov 6, 2025
aa37417
CUDA: fix crash on uneven context without FA (#16988)
JohannesGaessler Nov 6, 2025
7f09a68
ggml-cpu : optimize RVV q2_k and q3_k kernels (#16887)
xctan Nov 6, 2025
5c9a18e
common: move download functions to download.(cpp|h) (#17059)
ngxson Nov 7, 2025
8c0d6bb
server : print the samplers chain for each request (#17070)
ggerganov Nov 7, 2025
7c23f3f
ggml-cpu: detect correct cpu flags for arm64 (#16229) (#16239)
lizhenneng Nov 7, 2025
9eb9a13
Revert "ggml-cpu: detect correct cpu flags for arm64 (#16229) (#16239…
angt Nov 7, 2025
16bcc12
kv-cache : pad the cache size to 256 for performance (#17046)
ggerganov Nov 7, 2025
9008027
hparams : add n_embd_inp() to support extended embed (#16928)
CISC Nov 7, 2025
7956bb4
bench : cache the llama_context state at computed depth (#16944)
ggerganov Nov 7, 2025
6515610
CUDA: fix should_use_mmvf for ne11 == 1 (#17085)
JohannesGaessler Nov 7, 2025
ac76d36
vulkan : refactor buffer handling in vk_op_f32 (#16840)
Acly Nov 7, 2025
299f5d7
CUDA: properly handle nb00=nb02 case for cpy (#17081)
bssrdf Nov 7, 2025
647b960
ggml webgpu: faster matrix multiplication/matrix-vector multiplicatio…
reeselevine Nov 8, 2025
e14e842
CUDA: fix MMQ stream-k fixup ne1 indices (#17089)
JohannesGaessler Nov 8, 2025
d6fe40f
vulkan: Fix test-thread-safety crashes (#17024)
jeffbolznv Nov 8, 2025
b4e335d
vulkan: fuse rms_norm + mul + rope (+ view + set_rows) (#16977)
jeffbolznv Nov 8, 2025
08416eb
ggml: disable vxe for cross-compilation by default (#16966)
AlekseiNikiforovIBM Nov 8, 2025
b8a5cfd
vulkan: Increase BK to 32; use BK/4 for non-CM mul_mm.comp (#16636)
SavicStefan Nov 8, 2025
c1b1876
CUDA: skip fusion for repeating adds in bias (#17080)
am17an Nov 8, 2025
64fe17f
Revert "CUDA: add expert reduce kernel (#16857)" (#17100)
am17an Nov 8, 2025
eeee367
server: fix correct time_ms calculation in prompt_progress (#17093)
gSUz92nc Nov 8, 2025
53d7d21
vulkan: Use spec constants for conv2d s/d/p and kernel W/H (#16978)
jeffbolznv Nov 8, 2025
333f259
webui: fix keyboard shortcuts for new chat & edit chat title (#17007)
chansikpark Nov 8, 2025
aa3b7a9
arg: add --cache-list argument to list cached models (#17073)
ngxson Nov 8, 2025
0750a59
metal : retain src and dst buffers during async ops (#17101)
ggerganov Nov 9, 2025
80a6cf6
vulkan: fuse mul_mat_id + mul (#17095)
jeffbolznv Nov 9, 2025
8a3519b
vulkan: fix mmq out of bounds reads (#17108)
0cc4m Nov 9, 2025
7f3e9d3
vulkan: iGPU memory reporting fix (#17110)
0cc4m Nov 9, 2025
86fde91
Switch to using Ubuntu 25.10 vulkan/mesa (#16497)
ericcurtin Nov 9, 2025
ef1d826
benches : add folder with benchmarks (#16931)
ggerganov Nov 9, 2025
cb1adf8
server : handle failures to restore host cache (#17078)
ggerganov Nov 9, 2025
1c07c0c
convert : handle compressed-tensors quant method (#17069)
compilade Nov 9, 2025
802cef4
convert : parse safetensors directly (#15667)
compilade Nov 9, 2025
392e09a
vulkan: fix memory allocations (#17122)
0cc4m Nov 9, 2025
b8595b1
mtmd : fix embedding size for image input (#17123)
ggerganov Nov 9, 2025
15274c0
benches : add eval results (#17139)
ggerganov Nov 10, 2025
1032256
cuda/vulkan : bicubic interpolation (#17022)
Acly Nov 10, 2025
9898b57
editorconfig : ignore benches/ (#17140)
ggerganov Nov 10, 2025
4b13a68
mtmd: fix patch_size initialized to random value in audio models (#17…
ngxson Nov 10, 2025
f914544
batched-bench : add "separate text gen" mode (#17103)
ggerganov Nov 10, 2025
df70bed
arm64: add i8mm route with SVE ggml_vec_dot_q4_K_q8_K and ggml_vec_do…
fj-y-saito Nov 10, 2025
c27efd2
metal : enable tensor API for A19 (#17087)
ggerganov Nov 10, 2025
0c74f32
memory: Hybrid context shift (#17009)
gabe-l-hart Nov 10, 2025
85234a4
vulkan: fix validation issue introduced by #16868 (#17145)
0cc4m Nov 10, 2025
f117be1
vulkan: check glslc executable string (#17144)
0cc4m Nov 10, 2025
967eb4b
ggml-cpu : inspect -march and -mcpu to found the CPU (#16333)
angt Nov 10, 2025
13730c1
metal : cap threadgroups size of set_rows (#17146)
ggerganov Nov 10, 2025
395e286
cpu: skip NOPs to avoid barriers (#17133)
max-krasnyansky Nov 10, 2025
7bef684
models : move build_inp_out_ids outside loop (#17151)
CISC Nov 10, 2025
ece0f5c
opencl: add fastdiv and use it in set_rows, ported from cuda (#17090)
lhez Nov 10, 2025
2fc392c
convert : register UMT5Model architecture for T5 conversion (#17160)
levkropp Nov 11, 2025
d2d6269
Install rpc-server when GGML_RPC is ON. (#17149)
nbp Nov 11, 2025
4a5b8af
cmake : add version to all shared object files (#17091)
furrysalamander Nov 11, 2025
8c58324
kleidiai: add optimized per-channel kernels for Q8_0 (#16993)
chaxu01 Nov 11, 2025
73460f6
ggml-cpu: templateify ggml_compute_forward_rope_f32 and _f16 (#16805)
duduta Nov 11, 2025
ca48440
ggml-cpu : add RISC-V RVV (Zvfh) optimization for FP16 to FP32 conver…
ixgbe Nov 11, 2025
1d45b42
vendor: split httplib to cpp/h files (#17150)
ngxson Nov 11, 2025
3fe36c3
ci: add Arm-hosted Graviton4 runner (#17021)
sudhiarm Nov 11, 2025
7d019cf
disable rms norm mul rope for chips with no fp16 rte (#17134)
netrunnereve Nov 11, 2025
c273d75
hexagon: various Op fixes (#17135)
max-krasnyansky Nov 11, 2025
23a46ce
CANN: GGML_CANN_ACL_GRAPH works only USE_ACL_GRAPH enabled (#16861)
rauletorresc Nov 12, 2025
5da7664
[SYCL]fix ci crash about SSM_CONV (#17169)
NeoZhangJianyu Nov 12, 2025
655cddd
CANN: Add L2_NORM op support (#16856)
TecJesh Nov 12, 2025
78010a0
cmake : move OpenSSL linking to vendor/cpp-httplib (#17177)
angt Nov 12, 2025
52cf111
cmake : cleanup (#17199)
angt Nov 12, 2025
1c398dc
ggml-cpu: handle 3d tensors in repack mat_mul (#17030)
Alcpz Nov 12, 2025
ee8dd5c
server: move res_error/res_ok to static function (#17167)
ngxson Nov 12, 2025
017ecee
ci: add check vendor job (#17179)
ngxson Nov 12, 2025
00c9408
server: (refactor) implement generator-based API for task results (#1…
ngxson Nov 12, 2025
8e878f0
Update packages + upgrade Storybook to v10 (#17201)
allozaur Nov 12, 2025
374fe09
ggml : use std::sort in ggml_argsort CPU implementation (#17211)
ggerganov Nov 12, 2025
92bb442
docker : preserve .so symlinks for docker container builds (#17214)
furrysalamander Nov 12, 2025
5d6838b
CUDA: static assert to prevent misuse of memcpy_1 (#17198)
JohannesGaessler Nov 12, 2025
ffb6f3d
vocab : correct bounds check for UGM XCDA array access (#17215)
o7si Nov 12, 2025
07751f8
update SYCL support OPs (#17208)
NeoZhangJianyu Nov 13, 2025
a90eb94
CUDA: fuse rope + set_rows (#16884)
am17an Nov 13, 2025
97d5117
CANN: Add cross_entropy_loss op support (#16886)
TecJesh Nov 13, 2025
879dec3
ggml-cpu : use template for argsort (#17222)
slaren Nov 13, 2025
2776db6
Revert "ggml-cpu: handle 3d tensors in repack mat_mul (#17030)" (#17233)
ggerganov Nov 13, 2025
0cfb191
metal: accelerated conv2d (#17175)
bghira Nov 13, 2025
1215dde
ggml-cpu : add RISC-V vector intrinsic support for silu and cvar oper…
ixgbe Nov 13, 2025
dd091e5
sched : fix reserve ignoring user tensor assignments (#17232)
slaren Nov 13, 2025
a19bd6f
vulkan: remove shell call from vulkan-shaders-gen tool, revert file c…
0cc4m Nov 13, 2025
389ac78
ggml : add ops SOFTPLUS, EXPM1, TRI, SOLVE_TRI, CUMSUM (#17063)
pwilkin Nov 13, 2025
c4abcb2
server: fixing naming conflict res_error (#17243)
ngxson Nov 13, 2025
becc481
ggml-cpu: handle 3d tensors in repack mat_mul (#17241)
Alcpz Nov 13, 2025
f1bad23
Better UX for handling multiple attachments in WebUI (#17246)
allozaur Nov 14, 2025
307772f
readme : add RVV,ZVFH,ZFH,ZICBOP support for RISC-V (#17259)
ixgbe Nov 14, 2025
2606b0a
metal : make the FA extra sizes consistent (#17143)
ggerganov Nov 14, 2025
45c6ef7
metal : support argsort for ne00 > 1024 (#17247)
ggerganov Nov 14, 2025
d396b43
server : fix "can batch with" bug (#17263)
ggerganov Nov 14, 2025
6cd0cf7
fix : Dangling pointer for non-empty trigger words in lazy grammar co…
marek-hradil Nov 14, 2025
e1fcf8b
model : add AfmoeForCausalLM support (#16477)
bartowski1182 Nov 14, 2025
9b17d74
mtmd: add mtmd_log_set (#17268)
ngxson Nov 14, 2025
38eaf32
vulkan: change graph_compute to be async and enable get_tensor_async …
jeffbolznv Nov 15, 2025
234ae7d
vulkan: skip all-negative-inf blocks in FA (#17186)
jeffbolznv Nov 15, 2025
439342e
vulkan: Use ggml_vk_tensor_subbuffer in mul_mat_vec(id) paths (#17244)
jeffbolznv Nov 15, 2025
1568d13
vulkan: implement ABS and NEG (#17245)
giuseppe Nov 15, 2025
c7b7db0
mtmd-cli: Avoid logging to stdout for model loading messages in mtmd-…
ankurvdev Nov 15, 2025
9d3ef48
convert : set expert gating func in base class (#17279)
CISC Nov 15, 2025
9a8860c
convert : use all parts in safetensors index (#17286)
CISC Nov 15, 2025
4dca015
vulkan: Replace 16-bit unpack8 calls to work around legacy Windows AM…
0cc4m Nov 15, 2025
24dc769
vulkan: Fuse mul_mat_id+add_id+mul and mul_mat+add+add. (#17287)
jeffbolznv Nov 15, 2025
662192e
convert : remove unnecessary chat template patching (#17289)
CISC Nov 15, 2025
1411d92
webui: add OAI-Compat Harmony tool-call streaming visualization and p…
ServeurpersoCom Nov 15, 2025
22e1ce2
webui: Fix clickability around chat processing statistics UI (#17278)
allozaur Nov 15, 2025
72bd732
sycl : unify unary kernels with a generic implementation and enable w…
shani-f Nov 15, 2025
4db5641
opencl: add kernel to handle mat mul in attention to improve encoding…
shaofeiqi Nov 16, 2025
52e5d42
opencl: fix rms_norm_mul (#17250)
lhez Nov 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
18 changes: 14 additions & 4 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,15 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
AttributeMacros:
- __host__
- __device__
- __global__
- __forceinline__
- __launch_bounds__
BinPackArguments: true
BinPackParameters: true # OnePerLine
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
Expand Down Expand Up @@ -70,15 +77,18 @@ ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<.*\.h>'
- Regex: '".*"'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
- Regex: '^<.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '.*'
- Regex: '^<.*'
Priority: 3
SortPriority: 0
- Regex: '.*'
Priority: 4
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
Expand Down
1 change: 1 addition & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Checks: >
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
-performance-enum-size,
portability-*,
-portability-simd-intrinsics,
misc-*,
Expand Down
130 changes: 130 additions & 0 deletions .devops/cann.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================

# CANN base image used by every `FROM ${CANN_BASE_IMAGE}` below. It is declared
# before the first FROM so that it is visible to the FROM instructions of all
# stages; override it with `--build-arg CANN_BASE_IMAGE=...` to switch versions.
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

# ==============================================================================
# BUILD STAGE
# Compiles llama.cpp with the CANN backend and stages the artifacts under
# /app/lib (shared libraries) and /app/full (executables, Python scripts and
# the tools.sh entrypoint) so later stages can COPY them.
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Ascend chip model, forwarded to CMake as SOC_TYPE. Default is Ascend910B3.
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set the working directory --
WORKDIR /app

# -- Copy project files --
COPY . .

# -- Set CANN environment variables (required for compilation) --
# ENV (unlike `source` inside a single RUN) persists for all following
# instructions of this stage.
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# The stub directory is prepended on top of the lib64 entry set above.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# Only the core variables are set here; set_env.sh below provides the rest for
# the duration of the build command.

# -- Build llama.cpp --
# NOTE(review): `source` is a bash builtin, so this relies on the base image's
# /bin/sh being bash-compatible - confirm if the base image changes.
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Collect every shared object; `*.so*` also matches versioned names and
# `cp -P` copies symlinks as symlinks instead of following them.
RUN mkdir -p /app/lib && \
    find build -name "*.so*" -exec cp -P {} /app/lib \;

# Stage all executables, Python scripts and the entrypoint script.
# tools.sh must be copied here because the `full` target uses
# ENTRYPOINT ["/app/tools.sh"] after copying /app/full to /app.
RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/ && \
    cp .devops/tools.sh /app/full/tools.sh

# ==============================================================================
# BASE STAGE
# Shared runtime foundation for all final targets: the CANN image plus runtime
# packages and the shared libraries produced by the build stage.
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

# -- Runtime packages only (no compilers) --
RUN yum install -y libgomp curl && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- CANN runtime environment --
# ASCEND_TOOLKIT_HOME gets its own instruction: substitution inside one ENV
# instruction uses the values from before that instruction, so the variables
# derived from it have to be set afterwards.
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
# /app is first on LD_LIBRARY_PATH because the llama.cpp libraries are copied there.
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH} \
    PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH} \
    ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp

WORKDIR /app

# Shared libraries collected by the build stage
COPY --from=build /app/lib/ /app

# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================

### Target: full
# Everything staged in /app/full by the build stage (executables, Python
# scripts, requirements) plus the Python dependencies needed to run them.
# ==============================================================================
FROM base AS full

COPY --from=build /app/full /app

# Install git and Python, then the project's Python requirements; clean the
# yum caches in the same layer.
RUN yum install -y \
        git \
        python3 \
        python3-pip && \
    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
    pip3 install --no-cache-dir -r requirements.txt && \
    yum clean all && \
    rm -rf /var/cache/yum

# The entrypoint script is expected at /app/tools.sh, i.e. it must have been
# placed in /app/full by the build stage.
ENTRYPOINT ["/app/tools.sh"]
# Alternative when no tools.sh is shipped: start the server directly.
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Minimal image: the base runtime plus the llama-cli executable only.
# ==============================================================================
FROM base AS light

COPY --from=build /app/full/llama-cli /app

ENTRYPOINT ["/app/llama-cli"]

### Target: server
# Server image: the base runtime plus the llama-server executable only.
# ==============================================================================
FROM base AS server

# Sets the LLAMA_ARG_HOST environment variable to the wildcard address.
ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

# Probe the /health endpoint on port 8080 every 5 minutes (curl comes from the base stage).
HEALTHCHECK --interval=5m CMD ["curl", "-f", "http://localhost:8080/health"]

ENTRYPOINT ["/app/llama-server"]
22 changes: 0 additions & 22 deletions .devops/cloud-v-pipeline

This file was deleted.

8 changes: 2 additions & 6 deletions .devops/cpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,23 @@ FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ]; then \
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \
fi && \
cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
Expand Down
4 changes: 2 additions & 2 deletions .devops/cuda.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
Expand Down Expand Up @@ -61,7 +61,7 @@ RUN apt-get update \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
Expand Down
38 changes: 21 additions & 17 deletions .devops/intel.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04

## Build Image

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
Expand All @@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
Expand All @@ -31,7 +31,7 @@ RUN mkdir -p /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

RUN apt-get update \
&& apt-get install -y libgomp1 curl\
Expand All @@ -49,19 +49,23 @@ COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete

RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
python3-venv && \
python3 -m venv /opt/venv && \
. /opt/venv/bin/activate && \
pip install --upgrade pip setuptools wheel && \
pip install -r requirements.txt && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

ENV PATH="/opt/venv/bin:$PATH"

ENTRYPOINT ["/app/tools.sh"]

Expand Down
8 changes: 4 additions & 4 deletions .devops/musa.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc4.0.1
ARG MUSA_VERSION=rc4.3.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

Expand Down Expand Up @@ -32,7 +32,7 @@ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
find build -name "*.so*" -exec cp -P {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
Expand Down
9 changes: 4 additions & 5 deletions .devops/nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false,
useRpc ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

# It's necessary to consistently use backendStdenv when building with CUDA support,
Expand All @@ -47,6 +48,7 @@ let
inherit (lib)
cmakeBool
cmakeFeature
optionalAttrs
optionals
strings
;
Expand Down Expand Up @@ -127,10 +129,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
};

postPatch = ''
substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
'';

# With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
Expand Down Expand Up @@ -178,6 +176,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
(cmakeBool "GGML_METAL" useMetalKit)
(cmakeBool "GGML_VULKAN" useVulkan)
(cmakeBool "GGML_STATIC" enableStatic)
(cmakeBool "GGML_RPC" useRpc)
]
++ optionals useCuda [
(
Expand All @@ -197,7 +196,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
];

# Environment variables needed for ROCm
env = optionals useRocm {
env = optionalAttrs useRocm {
ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
};
Expand Down
Loading