Merged
Changes from all commits
201 commits
4ba9d71
metal: add neg operator (#13029)
jmorganca Apr 20, 2025
6616820
vulkan: support noncontiguous rms_norm (#13031)
jeffbolznv Apr 20, 2025
6602304
llava: fix errors in clip.h on certain compilers (#13030)
jmorganca Apr 20, 2025
2016f07
convert : experimental support for `--mmproj` flag (#13023)
ngxson Apr 20, 2025
84a9bf2
mtmd : merge llava, gemma3 and minicpmv CLI into single `llama-mtmd-c…
ngxson Apr 21, 2025
5368ddd
SYCL: Add non-contiguous support in ROPE (#12993)
qnixsynapse Apr 21, 2025
1d735c0
ggml : add SSE 4.2 and x64 base variant for CPUs without AVX (#12871)
slaren Apr 21, 2025
2434535
llava : update documentations (#13055)
ngxson Apr 22, 2025
7b53389
metal : add memory pool for temp allocs (#12850)
ggerganov Apr 22, 2025
ab47dec
security : add note about RPC and server functionality (#13061)
ggerganov Apr 22, 2025
dc39a5e
mtmd : support SmolVLM (version 1 and 2) (#13050)
ngxson Apr 22, 2025
658987c
CUDA: noncont MMVQ + batched bs1 MUL_MAT_ID (#13014)
JohannesGaessler Apr 22, 2025
2cca6c0
rpc : add command line option for number of threads for the CPU backe…
rgerganov Apr 23, 2025
eb1776b
convert : Append mult-eos,half-rope,bos to GLM4-0414 and Z (#13021)
piDack Apr 23, 2025
ecda2ec
mtmd : Support Pixtral 12B (#13065)
ngxson Apr 23, 2025
5630406
llama-mtmd-cli: Sigint rework in mtmd vision example (#13080)
pl752 Apr 23, 2025
b3b6d86
vulkan: matmul gcn tuning (#13016)
netrunnereve Apr 24, 2025
7604a7d
metal : fix floating-point range of attention scores in FA kernels (#…
ggerganov Apr 24, 2025
80982e8
arg : clean up handling --mmproj with -hf (#13082)
ngxson Apr 24, 2025
7c727fb
arg : add --no-mmproj-offload (#13093)
ngxson Apr 24, 2025
572b314
clang-tidy : disable warning about missing math parenthesis (#13091)
ggerganov Apr 24, 2025
13b4548
cmake : do not include ./src as public for libllama (#13062)
ggerganov Apr 24, 2025
b10d8bf
CUDA: use switch statements in constexpr functions (#13095)
JohannesGaessler Apr 24, 2025
c6e8cc2
ggml : Depthwise 2D convolution (ggml/1152)
Acly Apr 17, 2025
63b4911
sync : ggml
ggerganov Apr 24, 2025
87616f0
ggml : fix trailing whitespaces (#0)
ggerganov Apr 24, 2025
226251e
embeddings : fix batch sizes (#13076)
ggerganov Apr 24, 2025
13be08d
clip : remove boi/eoi embeddings for GLM-edge model (#13081)
ngxson Apr 24, 2025
553a5c3
rpc : do not wait for response when sending RPC_CMD_SET_TENSOR (#12943)
rgerganov Apr 25, 2025
514c456
change the reorder tensor from init to execute OP (#13003)
NeoZhangJianyu Apr 25, 2025
edb18b6
clip : fix pixtral on some GPU backends (#13097)
ngxson Apr 25, 2025
558a764
Force FP32 compute in GLM4 FFN Down (#13101)
city96 Apr 25, 2025
295354e
llama : fix K-shift with quantized K and BLAS backend (#13113)
slaren Apr 25, 2025
d5fe4e8
grammar : handle maxItems == 0 in JSON schema (#13117)
rick-github Apr 26, 2025
77d5e9a
ggml: move fp16/bf16 conversion optimizations to CPU backend + export…
SongXiaoXi Apr 26, 2025
4753791
clip : improve projector naming (#13118)
ngxson Apr 26, 2025
2d451c8
common : add common_remote_get_content (#13123)
ngxson Apr 26, 2025
ca2bb89
clip : Add Qwen2.5VL support (#12402)
HimariO Apr 27, 2025
59e991c
Fixes Qwen2.5VL segfault during inference with https://github.com/ggm…
LostRuins Apr 27, 2025
e291450
musa: fix build warning (#13129)
yeahdongcn Apr 27, 2025
ced44be
llama-chat : fix wrong template in GLM4-0414 (#13140)
matteoserva Apr 27, 2025
c0a97b7
llama-bench : Add `--override-tensors` arg (#12922)
4onen Apr 27, 2025
85f36e5
arg : fix unused variable (#13142)
ngxson Apr 28, 2025
69699be
CUDA: fix q_nope_absorbed prec for DS 2 Lite f16 (#13137)
JohannesGaessler Apr 28, 2025
f0dd6a1
musa: fix typo in cc control (#13144)
yeahdongcn Apr 28, 2025
e5d6c25
llama-chat : fix typo GML --> GLM (#13143)
ngxson Apr 28, 2025
43f2b07
common : fix noreturn compile warning (#13151)
ggerganov Apr 28, 2025
d0a417f
readme : update hot topics (#13150)
ggerganov Apr 28, 2025
a4c340f
SYCL: Add all missing unary kernels (#13074)
qnixsynapse Apr 28, 2025
5fa9e63
clip : refactor set input for cgraph + fix qwen2.5vl input (#13136)
ngxson Apr 28, 2025
d2b2031
llama : (mrope) allow using normal 1D position for text token (#13138)
ngxson Apr 28, 2025
fb0471d
context : do not clear output buffer on reserve (#13152)
pockers21 Apr 28, 2025
4e87962
mtmd : fix glm-edge redundant token count (#13139)
ngxson Apr 28, 2025
1831f53
llama-bench: add `-d` depth arg (#13096)
thevishalagarwal Apr 28, 2025
43ddab6
fix(rpc): Improve input validation and error handling (#13069)
thevilledev Apr 28, 2025
eaea325
clip : fix model size display (#13153)
ngxson Apr 28, 2025
5f5e39e
model : Nomic Embed Text V2 with Mixture-of-Experts (MoE) architectur…
manyoso Apr 28, 2025
b6ce743
llama-graph : fix text position for mrope (#13159)
ngxson Apr 29, 2025
e98b369
llama : set qwen3 model type sizes (#13175)
CISC Apr 29, 2025
00e3e5a
mtmd : add qwen2vl and qwen2.5vl (#13141)
ngxson Apr 29, 2025
7d3af70
llama : llm_type order by size (#13177)
CISC Apr 29, 2025
b67462c
ggml-qnn: add Qualcomm QNN backend for GGML
jeffzhou2000 Feb 14, 2025
f475838
ggml-qnn: santiy check
jeffzhou2000 Feb 15, 2025
edff40a
ggml-qnn: update script build-run-android.sh to compare peformance of…
jeffzhou2000 Feb 16, 2025
12bc7ed
ggml-qnn: fix minor issue in test-backend-ops.cpp
jeffzhou2000 Feb 17, 2025
df53005
ggml-qnn: merge QNN RPC feature from https://github.com/zhouwg/kantv/…
jeffzhou2000 Feb 18, 2025
b733ea7
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
jeffzhou2000 Feb 18, 2025
92303e5
ggml-qnn: a concise approach to offload mulmat to QNN backend(sync fr…
jeffzhou2000 Feb 19, 2025
2c041d3
ggml-qnn: remove redundant codes
jeffzhou2000 Feb 20, 2025
4907810
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
jeffzhou2000 Feb 20, 2025
55cd181
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
jeffzhou2000 Feb 20, 2025
14cad8d
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
jeffzhou2000 Feb 21, 2025
a054b47
ggml-qnn: add Qualcomm QNN backend for GGML
jeffzhou2000 Feb 14, 2025
4dce1e0
ggml-qnn: merge QNN RPC feature from https://github.com/zhouwg/kantv/…
jeffzhou2000 Feb 18, 2025
de55df2
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
jeffzhou2000 Feb 18, 2025
02e77d0
ggml-qnn: a concise approach to offload mulmat to QNN backend(sync fr…
jeffzhou2000 Feb 19, 2025
0d6dffc
ggml-qnn: remove redundant codes
jeffzhou2000 Feb 20, 2025
73bd6eb
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
jeffzhou2000 Feb 20, 2025
340dc4a
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
jeffzhou2000 Feb 20, 2025
fe181b8
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
jeffzhou2000 Feb 21, 2025
ac59dda
ggml-qnn: fix a minior typo in internal doc
jeffzhou2000 Feb 23, 2025
a8effb5
ggml-qnn: refine function ggml_qnn_create_general_tensor() to avoid c…
jeffzhou2000 Feb 23, 2025
c8058be
ggml-qnn: fix a minor typo in source code
jeffzhou2000 Feb 24, 2025
6a598ef
build: avoid ggml-qnn backend breaking other backend's builds
jeffzhou2000 Feb 24, 2025
4b3f241
ggml-qnn: remove redundant codes to make PR reviewers happy
jeffzhou2000 Feb 25, 2025
7ea4d3a
ggml-qnn: refine code format
jeffzhou2000 Feb 25, 2025
6540f50
ggml-qnn: offload quantized type mulmat to QNN backend
jeffzhou2000 Feb 26, 2025
612b572
ggml-qnn: refine source code structure to make code more clearly
jeffzhou2000 Feb 27, 2025
85b8570
ggml-qnn: enable release build with necessary logs to make reviewers …
jeffzhou2000 Feb 27, 2025
d1ba7c8
ggml-qnn: enable all quantize type with 2d mulmat
jeffzhou2000 Feb 27, 2025
e3f266a
ggml-qnn: enable log output of GGMLQNN_LOG_INFO in command line mode …
jeffzhou2000 Feb 28, 2025
4291439
ggml-qnn: Windows port --- step2
jeffzhou2000 Feb 28, 2025
12a4ad1
ggml-qnn: merge UT code and corresponding script from local dev branc…
jeffzhou2000 Mar 2, 2025
b4ee01d
ggml-qnn: merge ggml_qnn_mul_mat_4d from local dev branch to make wor…
jeffzhou2000 Mar 2, 2025
33643a9
ggml-qnn: submit AI-assisted ggml_qnn_mul_mat_4d(not worked currently…
jeffzhou2000 Mar 2, 2025
30c5719
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step2
jeffzhou2000 Mar 2, 2025
30909dd
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step3
jeffzhou2000 Mar 2, 2025
9ed3ecd
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step4
jeffzhou2000 Mar 2, 2025
a72930d
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step5
jeffzhou2000 Mar 2, 2025
eab76dd
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step6
jeffzhou2000 Mar 2, 2025
fe8bd7d
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step7
jeffzhou2000 Mar 2, 2025
1b92408
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step8
jeffzhou2000 Mar 2, 2025
5cd37f0
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- good in step9
jeffzhou2000 Mar 2, 2025
cfd0ced
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- narrow down t…
jeffzhou2000 Mar 2, 2025
e898166
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step10
jeffzhou2000 Mar 2, 2025
7bdeae0
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- narrow down t…
jeffzhou2000 Mar 2, 2025
e243ca5
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step11
jeffzhou2000 Mar 2, 2025
e1fef6b
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- both ok in st…
jeffzhou2000 Mar 2, 2025
ffb119f
ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 ---finalizing ver…
jeffzhou2000 Mar 2, 2025
58d64e7
ggml-qnn: refine ggml_qnn_mul_mat and ggml_qnn_general_node according…
jeffzhou2000 Mar 2, 2025
d5ec230
ggml-qnn: remove no-needed comments
jeffzhou2000 Mar 2, 2025
058ba0f
ggml-qnn: Windows port --- step3
jeffzhou2000 Mar 3, 2025
af5f952
ggml-qnn: remove un-needed function
jeffzhou2000 Mar 4, 2025
a622765
ggml-qnn:rebase to upstream
jeffzhou2000 Mar 4, 2025
a645a7d
ggml-qnn: fix a minior issue during rebase to upstream
jeffzhou2000 Mar 4, 2025
10fd07d
ggml-qnn: update script according to https://github.com/ggml-org/llam…
jeffzhou2000 Mar 4, 2025
c0eebf2
ggml-qnn: fix a minior issue in ggmlqnn_create_general_tensor()
jeffzhou2000 Mar 4, 2025
a69adbf
ggml-qnn: active member variable _device_id in class qnn_instance
jeffzhou2000 Mar 4, 2025
9ae6153
ggml-qnn: refine ggml_qnn_general_node and ggml_qnn_mul_mat to make c…
jeffzhou2000 Mar 4, 2025
b5c694b
ggml-qnn: Windows port --- step4
jeffzhou2000 Mar 6, 2025
eabb911
ggml-qnn: Windows port -- step5
jeffzhou2000 Mar 7, 2025
630f154
ggml-qnn: WoA(Windows on ARM) -- step6
jeffzhou2000 Mar 8, 2025
47042ee
ggml-qnn: rebase to upstream
jeffzhou2000 Mar 9, 2025
6ce5202
ggml-qnn: pr to upstream
jeffzhou2000 Mar 11, 2025
c3fd461
ggml-qnn: rebase to upstream
jeffzhou2000 Mar 18, 2025
7008c61
ggml-qnn: self code-review
jeffzhou2000 Mar 18, 2025
4bb6f0a
ggml-qnn: rebase upstream
jeffzhou2000 Mar 19, 2025
f5fcc0a
ggml-qnn: add approach through Hexagon cDSP
jeffzhou2000 Mar 22, 2025
4267a2e
ggml-qnn: refine general approach through Hexagon cDSP
jeffzhou2000 Mar 23, 2025
63a3fd2
ggml-qnn: refine the entire ggml-qnn.cpp to make code more clear
jeffzhou2000 Mar 24, 2025
2011a4e
ggml-qnn: refine the entire ggml-qnn.cpp to make code more clear
jeffzhou2000 Mar 24, 2025
184372f
ggml-qnn: add build script for libggmlop_skel.so
jeffzhou2000 Mar 24, 2025
5024133
ggml-qnn: remove redundant functions in this PR and make codes more c…
jeffzhou2000 Mar 25, 2025
244deb9
ggml-qnn: original ggml_compute_forward_add and ggml_compute_forward_…
jeffzhou2000 Mar 25, 2025
639605b
ggml-qnn: modify build-run-android.sh to verify mulmat and validate m…
jeffzhou2000 Mar 25, 2025
c2a21d2
ggml-qnn: make host code(ggml-qnn.cpp) more clear and more stable
jeffzhou2000 Mar 26, 2025
1bb49f3
ggml-qnn: refine code according to self code-review and make code mor…
jeffzhou2000 Mar 26, 2025
e69955e
ggml-qnn: offload more ggml op to Hexagon cDSP
jeffzhou2000 Mar 27, 2025
105e1cd
ggml-hexagon: code on AP(arm-cpu) side is stable now
jeffzhou2000 Mar 28, 2025
03ae20f
ggml-hexagon: optimize GGML_OP_ADD on cDSP side
jeffzhou2000 Mar 28, 2025
ba05e04
ggml-hexagon: simplify hexagon-kernel build logic in CMakeLists.txt
jeffzhou2000 Mar 29, 2025
19eb56d
ggml-hexagon: release ggml-hexagon v0.98
jeffzhou2000 Mar 29, 2025
c53f736
ggml-hexagon: release ggml-hexagon v0.99
jeffzhou2000 Mar 29, 2025
7c13b05
ggml-hexagon: try to offload q6_k mulmat to cDSP
jeffzhou2000 Mar 29, 2025
beab63f
ggml-hexagon: fix minior issue in ggml-hexagon.cpp after self code-re…
jeffzhou2000 Mar 29, 2025
6bd4231
ggml-hexagon: check validation of ggml-hexagon.cfg before create appr…
jeffzhou2000 Mar 30, 2025
6301f29
ggml-hexagon: fix all compiler warnings in ggml-hexagon.cpp
jeffzhou2000 Mar 30, 2025
57d3322
ggml-hexagon: enable only one backend device for HWACCEL_CDSP and ena…
jeffzhou2000 Mar 31, 2025
7775589
ggml-hexagon: rpc ion memory pool and test-backend-ops works fine in …
jeffzhou2000 Mar 31, 2025
2896ffc
ggml-hexagon: make comprision of mulmat performance between HWACCEL_Q…
jeffzhou2000 Mar 31, 2025
e2ae804
ggml-hexagon: release ggml-hexagon v1.00
jeffzhou2000 Mar 31, 2025
be973b4
ggml-hexagon: rebase to upstream
jeffzhou2000 Apr 1, 2025
ab2712d
ggml-hexagon: check configuration of enable_rpc_dma_mempool in functi…
jeffzhou2000 Apr 1, 2025
06d2509
ggml-hexagon: uniform rpc_ion_memsize and rpc_ion_usage between HWACC…
jeffzhou2000 Apr 1, 2025
8a5c5bd
ggml-hexagon: make buffer mechanism more clear in HWACCEL_CDSP approach
jeffzhou2000 Apr 1, 2025
119be62
ggml-hexagon: add perf function in hexagon kernerls on cDSP side
jeffzhou2000 Apr 2, 2025
895d403
ggml-hexagon: fix a stupid issue of why set rpc latency failure and i…
jeffzhou2000 Apr 2, 2025
ce4abac
ggml-hexagon: make helper function ggmlhexagon_get_timestring() threa…
jeffzhou2000 Apr 2, 2025
f0244a6
ggml-hexagon: fix a typo in ggml-hexagon.cpp
jeffzhou2000 Apr 2, 2025
478bb29
ggml-hexagon: list all known todo and fixme tasks in ggml-hexagon.cpp
jeffzhou2000 Apr 2, 2025
d914424
ggml-hexagon: fix units MB -> MiB
jeffzhou2000 Apr 2, 2025
3033280
ggml-hexagon: try to make ggml-hexagon backend works fine in a standa…
jeffzhou2000 Apr 3, 2025
374f27e
ggml-hexagon: remove reduament code and make debug log more clear
jeffzhou2000 Apr 3, 2025
bb2c663
ggml-hexagon: add gemma-3-4b-it-Q8_0.gguf to verify q8_0 mulmat on cDSP
jeffzhou2000 Apr 3, 2025
95d8ea1
ggml-hexagon:add skeleton code of offload GGML_OP_SOFT_MAX/GGML_OP_RM…
jeffzhou2000 Apr 3, 2025
507c5b2
ggml-hexagon: release ggml-dsp v0.60 on cDSP side
jeffzhou2000 Apr 4, 2025
9fc34e7
ggml-hexagon: merge build logic in kernels/Makefile to ggml-hexagon/C…
jeffzhou2000 Apr 5, 2025
0978e28
ggml-hexagon: fix a typo in ggml-hexagon.cpp
jeffzhou2000 Apr 5, 2025
0229a59
ggml-hexagon: uniform NDEBUG usage in ggml-hexagon.cpp and ggml-dsp.c
jeffzhou2000 Apr 6, 2025
e5da565
ggml-hexagon: add profiler feature for purpose of visualize NPU perfo…
jeffzhou2000 Apr 7, 2025
4e31ae2
ggml-hexagon: remove so-called dma memory pool to avoid confusion and…
jeffzhou2000 Apr 8, 2025
abe6857
ggml-hexagon: make function ggmlhexagon_init_rpcmempool in ggml-hexag…
jeffzhou2000 Apr 8, 2025
1bfae35
ggml-hexagon: fix potential resource leak in class hexagon_profiler
jeffzhou2000 Apr 8, 2025
200dae8
ggml-hexagon: enable multi-threading feature on cDSP side
jeffzhou2000 Apr 8, 2025
8b9375d
ggml-hexagon: upgrade QNN SDK to v2.33.0.250327
jeffzhou2000 Apr 9, 2025
a339a4d
ggml-hexagon: fix typo in ggml-hexagon.cpp
jeffzhou2000 Apr 9, 2025
6b47f08
ggml-dsp: probe QuRT RTOS information in function ggmlop_dsp_open
jeffzhou2000 Apr 9, 2025
3b5f172
ggml-hexagon: setting enable_rpc_ion_mempool to 1 and make test-backe…
jeffzhou2000 Apr 10, 2025
68e325b
ggml-hexagon: check whether user's specified htp arch is valid in CMa…
jeffzhou2000 Apr 10, 2025
7f24cd6
ggml-hexagon: sync with upstream
jeffzhou2000 Apr 11, 2025
cb0dfd7
ggml-hexagon: refine pinned-memory feature
jeffzhou2000 Apr 11, 2025
88acf6d
ggml-hexagon: refine build system in ggml-hexagon
jeffzhou2000 Apr 11, 2025
67c7d06
ggml-hexagon: remove redundant code in struct ggml_backend_hexagon_bu…
jeffzhou2000 Apr 11, 2025
36c3ff6
ggml-hexagon: upgrade Android NDK to android-ndk-r28
jeffzhou2000 Apr 11, 2025
57cfbbe
ggml-dsp: split ggml-dsp.c into multiple files and cleanup
jeffzhou2000 Apr 11, 2025
bcb5012
ggml-dsp: refine ggml-dsp and make ggml-dsp more clear
jeffzhou2000 Apr 12, 2025
6931510
ggml-hexagon: fix a minior issue in dev ops
jeffzhou2000 Apr 12, 2025
c45cd5e
ggml-hexagon: fix a build issue in CI
jeffzhou2000 Apr 12, 2025
7b55a46
ggml-dsp: cleanup code
jeffzhou2000 Apr 15, 2025
6f11897
ggml-hexagon: sync with upstream
jeffzhou2000 Apr 15, 2025
157b6b1
ggml-dsp: cleanup code
jeffzhou2000 Apr 16, 2025
d4afea4
ggml-dsp:refine ggmlhexagon_dsp_add_f32
jeffzhou2000 Apr 16, 2025
2862e27
ggml-dsp: refine logic of thread_counts
jeffzhou2000 Apr 17, 2025
c36bd93
ggml-hexagon: release v1.06 and ready for code review
jeffzhou2000 Apr 17, 2025
4f70d23
ggml-dsp: make GGML_OP_ADD more faster on cDSP side
jeffzhou2000 Apr 19, 2025
7b00b51
ggml-hexagon: sync from project kantv(make ggml-hexagon backend can w…
jeffzhou2000 Apr 24, 2025
b6072fa
sync with upstream llama.cpp and sync ggml-hexagon.cpp from project k…
jeffzhou2000 Apr 29, 2025
fe88096
Merge pull request #62 from zhouwg/pr_to_upstream
l3utterfly Apr 30, 2025
28565d1
Enhance ggml_backend_reg_layla to support Hexagon backend and update …
l3utterfly Apr 30, 2025
1e9db91
Refactor memory allocation and method stubs in ggml-hexagon
l3utterfly May 16, 2025
6d8ad6f
Implement file management for libggmlop-skel.so based on DSP architec…
l3utterfly May 17, 2025
1 change: 1 addition & 0 deletions .clang-tidy
@@ -13,6 +13,7 @@ Checks: >
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
-readability-math-missing-parentheses,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
15 changes: 15 additions & 0 deletions CMakeLists.txt
@@ -7,6 +7,20 @@ set(CMAKE_WARN_UNUSED_CLI YES)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if(CMAKE_SYSTEM_NAME STREQUAL "Android")
if(DEFINED HTP_ARCH_VERSION)
if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79")
#works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend
set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only")
message("OPT_FLAG:${OPT_FLAG}")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}")
endif()
endif()
endif()

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
@@ -119,6 +133,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
llama_option_depr(WARNING LLAMA_HEXAGON GGML_HEXAGON)

if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
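The new Android-only block in CMakeLists.txt above is gated on HTP_ARCH_VERSION, so it only takes effect when that cache variable is passed at configure time. A minimal configure sketch follows, assuming a standard NDK toolchain setup; the NDK path, ABI, platform level and build directory are illustrative assumptions, while GGML_HEXAGON and HTP_ARCH_VERSION come from this PR.

```bash
# Hypothetical Android configure step; adjust the NDK path and ABI for your setup.
cmake -B build-android \
  -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
  -DANDROID_ABI=arm64-v8a \
  -DANDROID_PLATFORM=android-34 \
  -DGGML_HEXAGON=ON \
  -DHTP_ARCH_VERSION=v75      # v75/v79 trigger the extra OPT_FLAG tuning shown above
cmake --build build-android --config Release -j
```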
5 changes: 3 additions & 2 deletions README.md
@@ -16,8 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

## Hot topics

- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
3 changes: 2 additions & 1 deletion SECURITY.md
@@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
### Untrusted environments or networks

If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
* Encrypt your data if sending it over the network.

### Multi-Tenant environments
191 changes: 135 additions & 56 deletions common/arg.cpp
@@ -38,6 +38,11 @@

using json = nlohmann::ordered_json;

std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_LLAVA,
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
};

common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples);
return *this;
@@ -157,6 +162,10 @@ struct common_hf_file_res {

#ifdef LLAMA_USE_CURL

bool common_has_curl() {
return true;
}

#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
@@ -522,64 +531,89 @@ static bool common_download_model(
return true;
}

/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}

// fetch model info from Hugging Face Hub API
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::vector<char> res_buffer;

std::string model_endpoint = get_model_endpoint();

std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
auto data_vec = static_cast<std::vector<char> *>(data);
data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!bearer_token.empty()) {
std::string auth_header = "Authorization: Bearer " + bearer_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
if (params.timeout > 0) {
curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
}
if (params.max_size > 0) {
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
for (const auto & header : params.headers) {
http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
}
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);

CURLcode res = curl_easy_perform(curl.get());

if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
std::string error_msg = curl_easy_strerror(res);
throw std::runtime_error("error: cannot make GET request: " + error_msg);
}

long res_code;
std::string ggufFile = "";
std::string mmprojFile = "";
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);

return { res_code, std::move(res_buffer) };
}

/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}

std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;

// headers
std::vector<std::string> headers;
headers.push_back("Accept: application/json");
if (!bearer_token.empty()) {
headers.push_back("Authorization: Bearer " + bearer_token);
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
// User-Agent header is already set in common_remote_get_content, no need to set it here

// make the request
common_remote_params params;
params.headers = headers;
auto res = common_remote_get_content(url, params);
long res_code = res.first;
std::string res_str(res.second.data(), res.second.size());
std::string ggufFile;
std::string mmprojFile;

if (res_code == 200) {
// extract ggufFile.rfilename in json, using regex
{
@@ -613,6 +647,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_

#else

bool common_has_curl() {
return false;
}

static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
LOG_ERR("error: built without CURL, cannot download model from internet\n");
return false;
@@ -635,17 +673,30 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
return {};
}

std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
if (!url.empty()) {
throw std::runtime_error("error: built without CURL, cannot download model from the internet");
}

return {};
}

#endif // LLAMA_USE_CURL

//
// utils
//

static void common_params_handle_model(
struct handle_model_result {
bool found_mmproj = false;
common_params_model mmproj;
};

static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
const std::string & model_path_default,
bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
const std::string & model_path_default) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.hf_repo.empty()) {
@@ -657,7 +708,12 @@ static void common_params_handle_model(
exit(1); // built without CURL, error message already printed
}
model.hf_repo = auto_detected.repo;
model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
result.mmproj.hf_repo = model.hf_repo;
result.mmproj.hf_file = auto_detected.mmprojFile;
}
} else {
model.hf_file = model.path;
}
@@ -694,6 +750,8 @@ static void common_params_handle_model(
exit(1);
}
}

return result;
}

const std::vector<ggml_type> kv_cache_types = {
@@ -827,16 +885,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}

common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
common_params_handle_model(params.speculative.model, params.hf_token, "");
common_params_handle_model(params.vocoder.model, params.hf_token, "");

// allow --mmproj to be set from -hf
// assuming that mmproj is always in the same repo as text model
if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
params.mmproj.hf_repo = params.model.hf_repo;
// handle model and download
{
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
// optionally, handle mmproj model when -hf is specified
params.mmproj = res.mmproj;
}
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "");
break;
}
}
common_params_handle_model(params.speculative.model, params.hf_token, "");
common_params_handle_model(params.vocoder.model, params.hf_token, "");
}
common_params_handle_model(params.mmproj, params.hf_token, "", true);

if (params.escape) {
string_process_escapes(params.prompt);
@@ -968,28 +1035,25 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
"llama-gbnf-validator",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
"llama-gguf-split",
"llama-gritlm",
"llama-imatrix",
"llama-infill",
"llama-llava-cli",
"llama-mtmd-cli",
"llama-llava-clip-quantize-cli",
"llama-lookahead",
"llama-lookup",
"llama-lookup-create",
"llama-lookup-merge",
"llama-lookup-stats",
"llama-minicpmv-cli",
"llama-parallel",
"llama-passkey",
"llama-perplexity",
"llama-q8dot",
"llama-quantize",
"llama-quantize-stats",
"llama-qwen2vl-cli",
"llama-retrieval",
"llama-run",
@@ -2096,18 +2160,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg(
{"--mmproj"}, "FILE",
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
"path to a multimodal projector file. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.path = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--mmproj-url"}, "URL",
"URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
"URL to a multimodal projector file. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--no-mmproj"},
"explicitly disable multimodal projector, useful when using -hf",
[](common_params & params) {
params.no_mmproj = true;
}
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--no-mmproj-offload"},
"do not offload multimodal projector to GPU",
[](common_params & params) {
params.mmproj_use_gpu = false;
}
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--image"}, "FILE",
"path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2382,6 +2460,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
@@ -2726,7 +2805,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
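Taken together, the argument-parsing changes above let a single -hf download also resolve the matching multimodal projector, with --no-mmproj and --no-mmproj-offload as opt-outs. A rough usage sketch follows; the flag names are from this diff, while the repository name and file paths are hypothetical placeholders, and actual behaviour may differ.

```bash
# Hypothetical invocations illustrating the new flags; repo name and paths are placeholders.

# -hf now also picks up an mmproj file when the repo publishes one alongside the GGUF model.
llama-mtmd-cli -hf someuser/some-vlm-GGUF:q4_k_m --image ./photo.jpg -p "Describe this image."

# Skip the automatic projector download, or keep the projector off the GPU.
llama-mtmd-cli -hf someuser/some-vlm-GGUF:q4_k_m --no-mmproj
llama-mtmd-cli -hf someuser/some-vlm-GGUF:q4_k_m --no-mmproj-offload --image ./photo.jpg -p "Describe this image."

# An explicitly provided projector file still works as before.
llama-mtmd-cli -m ./model.gguf --mmproj ./mmproj.gguf --image ./photo.jpg -p "Describe this image."
```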