Merged
Changes from all commits (173 commits)
27cb396
add geglu activation function (#14074)
huydt84 Jun 9, 2025
0a978c7
sycl: Add reorder to Q6_K mmvq implementation (#13885)
s-Nick Jun 9, 2025
e445062
webui: fix sidebar being covered by main content (#14082)
yeahdongcn Jun 9, 2025
313ebf1
CANN: Simplify the environment variable setting(#13104)
bachelor-dou Jun 9, 2025
1404858
graph : fix geglu (#14077)
ggerganov Jun 9, 2025
d9cd982
ggml-cpu : split arch-specific implementations (#13892)
xctan Jun 9, 2025
b36cf4c
llama : allow building all tests on windows when not using shared lib…
slaren Jun 9, 2025
7916c5a
sync : ggml
ggerganov Jun 10, 2025
63786e9
Vulkan: Don't default to CPU device (like llvmpipe), even if no other…
0cc4m Jun 10, 2025
cf6d8dd
ggml : fix weak alias win32 (whisper/0)
ggerganov Jun 10, 2025
fd91217
sync : ggml
ggerganov Jun 10, 2025
756410c
vulkan: force device 0 in CI (#14106)
jeffbolznv Jun 10, 2025
aac5723
llama : support GEGLU for jina-bert-v2 (#14090)
CISC Jun 10, 2025
5952acc
convert : fix duplicate key DeepSeek-R1 conversion error (#14103)
CISC Jun 10, 2025
9b7cb10
kv-cache : avoid modifying recurrent cells when setting inputs (#13834)
compilade Jun 10, 2025
b98886c
opencl: add `mul_mv_id_q4_0_f32_8x_flat` (#14003)
lhez Jun 10, 2025
095defc
vulkan: Track descriptor pools/sets per-context (#14109)
jeffbolznv Jun 11, 2025
856f024
kv-cache : add LLAMA_KV_CACHE_DEBUG environment variable (#14121)
ggerganov Jun 11, 2025
a22916c
kv-cache : relax SWA masking condition (#14119)
ggerganov Jun 11, 2025
07fae80
webui: Wrap long numbers instead of infinite horizontal scroll (#14062)
am17an Jun 11, 2025
6fe06c2
vulkan: Better thread-safety for command pools/buffers (#14116)
jeffbolznv Jun 11, 2025
d8e7703
tests : add test-tokenizers-repo (#14017)
CISC Jun 11, 2025
996c2fc
chore : clean up relative source dir paths (#14128)
CISC Jun 11, 2025
1c128f9
Implement GGML_CPU_ALL_VARIANTS for ARM (#14080)
ckastner Jun 11, 2025
0095364
kv-cache : fix split_equal handling in unified implementation (#14130)
ggerganov Jun 12, 2025
1a6b4e6
batch : remove logits_all flag (#14141)
ggerganov Jun 12, 2025
63a9403
context : simplify output counting logic during decode (#14142)
ggerganov Jun 12, 2025
dabef7e
cmake : Improve build-info.cpp generation (#14156)
ckastner Jun 13, 2025
8f86e0d
pooling : make cls_b and cls_out_b optional (#14165)
huydt84 Jun 13, 2025
bb11279
cmake: Add ability to pass in LLAMA_BUILD_NUMBER/COMMIT (#14167)
ckastner Jun 13, 2025
f07b7e6
batch : rework llama_batch_allocr (#14153)
ggerganov Jun 13, 2025
1468275
batch : add LLAMA_BATCH_DEBUG environment variable (#14172)
ggerganov Jun 13, 2025
19ace0b
Merge commit from fork
GuyGoldenberg Jun 13, 2025
056505f
vocab : fix build (#14175)
ggerganov Jun 13, 2025
c94a5fb
batch : auto-gen positions + verify multi-sequence input (#14177)
ggerganov Jun 15, 2025
3c07909
cparams : rename LLAMA_MAX_PARALLEL_SEQUENCES to LLAMA_MAX_SEQ (#14188)
ggerganov Jun 15, 2025
49209a7
model : add dots.llm1 architecture support (#14044) (#14118)
Noeda Jun 15, 2025
f59c0eb
kv-cache : fix use-after-move of defrag info (#14189)
ggerganov Jun 15, 2025
a3be98b
model : Add support for Arcee AI's upcoming AFM model (#14185)
bartowski1182 Jun 15, 2025
7f356e8
ggml-cpu : rework weak alias on apple targets (#14146)
xctan Jun 16, 2025
3b5e73c
vulkan: mutex around vkQueueSubmit (#14127)
jeffbolznv Jun 16, 2025
b6e5b86
convert : remove arcee change in convert_hf_to_gguf_update.py (#14207)
bartowski1182 Jun 16, 2025
e2c4984
ggml: Add Android support for GGML_CPU_ALL_VARIANTS (#14206)
chaxu01 Jun 16, 2025
5b64ded
llama : rework embeddings logic (#14208)
ggerganov Jun 16, 2025
d4e511a
model : add NeoBERT (#14164)
huydt84 Jun 16, 2025
0e98357
cmake: clean up external project logic for vulkan-shaders-gen (#14179)
bandoti Jun 16, 2025
4410a19
llama : add thread safety test (#14035)
slaren Jun 16, 2025
ca17789
server : fix incorrect usage of llama_get_embeddings() (#14225)
ggerganov Jun 16, 2025
6d66866
ggml-cpu : remove the weak alias trick (#14221)
xctan Jun 17, 2025
4457d07
cmake: remove shader-gen step-targets from ggml-vulkan (#14226)
bandoti Jun 17, 2025
49f52a4
examples : include examples in msvc disable warn (ggml/1270)
danbev Jun 12, 2025
65b3b01
ggml : disable warnings for tests when using MSVC (ggml/1273)
danbev Jun 13, 2025
a1c7ebb
sync : ggml
ggerganov Jun 18, 2025
a886ed5
convert : fix null head_dim AutoConfig regression (#14248)
CISC Jun 18, 2025
48f0338
ggml: Add Apple support for GGML_CPU_ALL_VARIANTS (#14258)
chaxu01 Jun 18, 2025
6140b73
docs: add s390x build documentation (#14264)
taronaeo Jun 18, 2025
13dca6e
metal : add mean kernel (#14267)
ggerganov Jun 19, 2025
20bc1a3
memory : Hybrid recurrent cache (#13979)
gabe-l-hart Jun 19, 2025
3ab5d06
Vulkan: Set device max size for host memory to avoid OOM warning and …
0cc4m Jun 19, 2025
05e1822
llamafile : support s390x SIMD instruction set (#14273)
taronaeo Jun 19, 2025
12e9427
convert : fix remote option in Windows (#14100)
pqnet Jun 19, 2025
ecfea67
build : suppress gcc15 compile warnings (#14261)
fanyang89 Jun 19, 2025
7778c04
server : add server parameters for draft model cache type (#13782)
aa956 Jun 19, 2025
056c737
ggml-cpu : remove unnecesary arm feature detection (#14281)
slaren Jun 19, 2025
f6ce51e
CUDA: add conv_2d_dw (#14265)
am17an Jun 20, 2025
db93abb
ubatch : new splitting logic (#14217)
ggerganov Jun 20, 2025
d46d89c
model : more uniform output id handling (#14275)
ggerganov Jun 20, 2025
4a409cd
ggml: Update KleidiAI to v1.9.0 (#14277)
chaxu01 Jun 20, 2025
396825e
ggml : fix repack work size for mul_mat_id (#14292)
ggerganov Jun 20, 2025
7982148
cuda : synchronize graph capture and cublas handle destruction (#14288)
slaren Jun 20, 2025
dd1e26e
llama : improve sep token handling (#14272)
CISC Jun 20, 2025
e063cec
Implement GGML_CPU_ALL_VARIANTS for PowerPC (#14286)
ckastner Jun 20, 2025
65c3447
sycl: add usage of enqueue_functions extension (#14244)
s-Nick Jun 20, 2025
c1eb2f7
vocab : prevent tokenizer overflow (#14301)
retr0reg Jun 20, 2025
013f21f
lint : remove trailing whitepace (#14304)
CISC Jun 20, 2025
8c4464b
CUDA: add conv_2d_transpose (#14287)
am17an Jun 20, 2025
ad37fb8
Add `ggml_roll` (ggml/1274)
Acly Jun 18, 2025
85bd068
sync : ggml
ggerganov Jun 20, 2025
58754ae
convert : fix Llama 4 conversion (#14311)
danielhanchen Jun 21, 2025
67bdf4b
memory : rename interface to llama_memory_context_i (#14296)
ggerganov Jun 21, 2025
19e7e05
metal : fix thread-safety (#14300)
ggerganov Jun 21, 2025
94f00fb
gguf-py : fix TemplateProcessing pair when bos/eos is missing (#14312)
CISC Jun 21, 2025
a0851ce
Add support for VK_EXT_debug_utils to add labels to Vulkan objects. (…
mtavenrath Jun 21, 2025
6e34a22
gguf-py : fix Qwen3-Embedding eos token (#14314)
CISC Jun 21, 2025
8644697
CUDA: add mean operation (#14313)
am17an Jun 22, 2025
5313f82
HIP: enable vec fattn on RDNA4 (#14323)
IMbackK Jun 22, 2025
0e3700d
examples : fix is_first logic for tokenization (#14329)
ggerganov Jun 22, 2025
d2422f2
run : avoid double tokenization (#14327)
retr0reg Jun 22, 2025
69a4ba7
gguf-py : fix SpecialVocab parsing when post_processor is null (#14330)
CISC Jun 22, 2025
15e51ab
quantize : handle user-defined pruning of whole layers (blocks) (#13037)
EAddario Jun 22, 2025
50d76b4
vulkan: update windows SDK in CI (#14334)
jeffbolznv Jun 23, 2025
5012a26
kv-cells : fix tracking of seq_pos (#14339)
ggerganov Jun 23, 2025
888016b
CUDA: mul_mat_v support for batch sizes > 1 (#14262)
JohannesGaessler Jun 23, 2025
b673834
ci: add workflow for relocatable cmake package (#14346)
bandoti Jun 23, 2025
7813ddf
CUDA/HIP: optimize mmv paths taken for HIP devices (#14324)
IMbackK Jun 23, 2025
5a7fa6f
cmake : use LLAMA_BUILD_NUMBER when defining LLAMA_INSTALL_VERSION (#…
mbaudier Jun 24, 2025
790ab01
batch : fix check for empty sequences in memory (#14364)
ggerganov Jun 24, 2025
e79eff7
opencl: ref count `ggml_backend_opencl_context` and refactor profilin…
lhez Jun 24, 2025
ac64cda
ggml-cpu: enable IBM NNPA Vector Intrinsics (#14317)
taronaeo Jun 25, 2025
619e366
musa: enable fp16 mma (all) and cublas on qy2 (#13842)
yeahdongcn Jun 26, 2025
edb1802
docs: update s390x documentation + add faq (#14389)
taronaeo Jun 26, 2025
e9ea90a
metal : batch rows copy in a single threadgroup (#14384)
ggerganov Jun 26, 2025
c089b0c
metal : add special-case mat-vec mul for ne00 == 4 (#14385)
ggerganov Jun 26, 2025
067f0fd
llama : return mistral-v7-tekken as default template only (#14390)
CISC Jun 26, 2025
504e6c5
cmake: regen vulkan shaders when shaders-gen sources change (#14398)
bandoti Jun 26, 2025
132a602
model : gemma3n text-only (#14400)
ngxson Jun 26, 2025
0db1d5e
convert : fix broken sentencepiece vocab (#14416)
CISC Jun 27, 2025
3bb8c0c
ggml : add ggml_set_rows (#14274)
rgerganov Jun 27, 2025
ed32d76
recurrent : call balloc split_reset() in init_batch() (#14414)
ggerganov Jun 27, 2025
c232ee0
graph : make llm_graph_context destructor virtual (#14410)
ggerganov Jun 27, 2025
e727777
vulkan: Fix GGML_VULKAN_SHADER_DEBUG_INFO (#14427)
jeffbolznv Jun 28, 2025
ac0a658
ci : fix windows build and release (#14431)
CISC Jun 28, 2025
aeecc1f
fix async_mode bug (#14432)
bachelor-dou Jun 28, 2025
6eab236
model : add support for ERNIE 4.5 0.3B model (#14408)
ownia Jun 28, 2025
9a5ed11
vulkan: lock accesses of pinned_memory vector (#14333)
jeffbolznv Jun 28, 2025
b5e8d12
vulkan: handle noncontig in the final case of ggml_vk_get_cpy_pipelin…
jeffbolznv Jun 28, 2025
232447d
CUDA: add bf16 and f32 support to cublas_mul_mat_batched (#14361)
am17an Jun 28, 2025
cdaa419
vulkan: Add fusion support for RMS_NORM+MUL (#14366)
jeffbolznv Jun 29, 2025
a0deecd
ggml : implement REGLU/GEGLU/SWIGLU ops (#14158)
CISC Jun 29, 2025
768e24f
ggml : fix unmerged GGML_FPxx_TO_FPxx refactoring (#14443)
CISC Jun 29, 2025
2e149f8
SYCL: disable faulty fp16 exp kernel (#14395)
qnixsynapse Jun 29, 2025
e6c926a
server : fix appearance of the chats list context menu for Safari (#1…
rntk Jun 29, 2025
7b5edf1
server : support jinja extra template kwargs (Qwen3 enable_thinking f…
matteoserva Jun 29, 2025
d037163
scripts : make the shell scripts cross-platform (#14341)
vedranmiletic Jun 30, 2025
da3403a
cmake : Remove redundant include path in CMakeLists.txt (#14452)
xiaobing318 Jun 30, 2025
648d7dc
test-backend-ops : disable llama test (#14461)
slaren Jun 30, 2025
d3ed05b
ggml-cpu: sycl: Re-enable exp f16 (#14462)
Rbiessy Jun 30, 2025
1be286a
metal : disable fast-math for some cpy kernels (#14460)
ggerganov Jun 30, 2025
82f6275
memory : correctly handle failure in apply() (#14438)
ggerganov Jun 30, 2025
27b1879
Add Conv2d for CPU (#14388)
am17an Jun 30, 2025
c59ea5e
opencl : add GEGLU, REGLU, SWIGLU (#14456)
lhez Jul 1, 2025
fdb8b79
ggml-cpu : "align corners" for bilinear upscale/downscale (ggml/1285)
Acly Jul 1, 2025
e95085a
sync : ggml
ggerganov Jul 1, 2025
11756d9
ggml : remove trailing whitespace (#0)
ggerganov Jul 1, 2025
3e0dd41
add GELU_ERF (#14455)
CISC Jul 1, 2025
9c15ba1
vulkan: Split large mul_mat_id to fit in shared memory (#14451)
jeffbolznv Jul 1, 2025
5cc2145
ci : disable fast-math for Metal GHA CI (#14478)
ggerganov Jul 1, 2025
cdbdcf4
ggml : Callback before abort (#14481)
ScaledLizard Jul 2, 2025
3d678af
github : add OpenCL backend to issue templates (#14492)
EZForever Jul 2, 2025
29b734c
ci : add OpenCL to labeler workflow (#14496)
CISC Jul 2, 2025
b858a74
opencl : update upscale to support align corners (#14488)
lhez Jul 2, 2025
9c24e25
opencl : skip empty nodes on cgraph compute (#14491)
EZForever Jul 2, 2025
e40afd8
simple-chat : fix context-exceeded condition (#14494)
ggerganov Jul 2, 2025
2b1da75
opencl : fix possible buffer overflow in dump_tensor (#14490)
jeffzhou2000 Jul 2, 2025
329c067
ggml : support bcast ggml_soft_max_ext, ggml_flash_attn_ext (#14435)
ggerganov Jun 27, 2025
1b48138
vulkan: support softmax/FA batch and broadcast (#14449)
jeffbolznv Jul 1, 2025
62a3fbb
CUDA: broadcasting for FlashAttention mask (#14500)
JohannesGaessler Jul 2, 2025
659dba8
CUDA: add softmax broadcast (#14475)
am17an Jul 2, 2025
7454d93
Set RPATH to "@loader_path" / "$ORIGIN" to ensure executables and dyn…
rotemdan Jul 2, 2025
6b57e78
ggml : add version function to get lib version (ggml/1286)
danbev Jul 2, 2025
e4aebb6
sync : ggml
ggerganov Jul 2, 2025
7871357
llama : initial Mamba-2 support (#9126)
compilade Jul 2, 2025
3f77b9b
gguf-py : add support for chat template jinja files (#14508)
CISC Jul 2, 2025
9455f96
CUDA: add dynamic shared mem to softmax, refactor general usage (#14497)
am17an Jul 2, 2025
cca7b95
ggml : remove kompute backend (#14501)
ggerganov Jul 3, 2025
ebadc44
ggml : fix FA mask dim 2 and 3 (#14505)
ggerganov Jul 3, 2025
160aeec
kv-cache : use ggml_set_rows (#14285)
ggerganov Jul 3, 2025
6a270ce
convert : correct gemma 3n conversion (#14450)
ngxson Jul 3, 2025
0412572
Fix conditional enabling following arch checks for ggml-sycl (#14504)
s-Nick Jul 3, 2025
9085ac3
ggml: backward pass for split swiglu (#14483)
JohannesGaessler Jul 3, 2025
a10a803
vulkan: support mixed/deepseekR1 FA head sizes (#14509)
jeffbolznv Jul 3, 2025
54d339e
opencl : broadcast for soft_max (#14510)
lhez Jul 3, 2025
78eadc1
ggml : implement GEGLU_ERF and GEGLU_QUICK ops (#14445)
CISC Jul 3, 2025
f37f966
CANN: Replace aclrtMemsetSync with aclnnInplaceZero operator (#14002)
luyhcsu Jul 4, 2025
9d1aee7
batch : add n_used count (#14512)
ggerganov Jul 4, 2025
f44cbba
graph : prepare for 4D mask (#14515)
ggerganov Jul 4, 2025
52132b4
batch : add optional for sequential equal split (#14511)
ggerganov Jul 4, 2025
430ab86
metal : disable fast math in all quantize kernels (#14528)
ggerganov Jul 4, 2025
4d0589a
test-backend-ops: add support for specifying output format (#14368)
yeahdongcn Jul 5, 2025
2fc8e94
eval-callback : check for empty input (#14539)
ggerganov Jul 5, 2025
1ec29d3
opencl: add GELU_ERF (#14476)
CISC Jul 5, 2025
e88a353
server : fix assistant prefilling when content is an array (#14360)
CISC Jul 5, 2025
f24278e
vulkan: Handle updated FA dim2/3 definition (#14518)
jeffbolznv Jul 5, 2025
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -40,7 +40,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
11 changes: 5 additions & 6 deletions .github/labeler.yml
@@ -1,10 +1,4 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute/**
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
@@ -93,3 +87,8 @@ Ascend NPU:
- ggml/include/ggml-cann.h
- ggml/src/ggml-cann/**
- docs/backend/CANN.md
OpenCL:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-opencl.h
- ggml/src/ggml-opencl/**
11 changes: 1 addition & 10 deletions .github/workflows/build.yml
@@ -740,9 +740,6 @@ jobs:
- build: 'llvm-arm64-opencl-adreno'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
# - build: 'kompute-x64'
# arch: 'x64'
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'

steps:
- name: Clone
@@ -756,12 +753,6 @@
variant: ccache
evict-old-files: 1d

- name: Clone Kompute submodule
id: clone_kompute
if: ${{ matrix.build == 'kompute-x64' }}
run: |
git submodule update --init ggml/src/ggml-kompute/kompute
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -777,7 +768,7 @@
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
if: ${{ matrix.build == 'vulkan-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
10 changes: 8 additions & 2 deletions .github/workflows/release.yml
@@ -49,7 +49,8 @@ jobs:
run: |
sysctl -a
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
@@ -103,7 +104,8 @@
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
@@ -160,6 +162,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -211,6 +215,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
3 changes: 0 additions & 3 deletions .gitmodules
@@ -1,3 +0,0 @@
[submodule "kompute"]
path = ggml/src/ggml-kompute/kompute
url = https://github.com/nomic-ai/kompute.git
1 change: 0 additions & 1 deletion CMakeLists.txt
@@ -120,7 +120,6 @@ endfunction()

llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
115 changes: 111 additions & 4 deletions convert_hf_to_gguf.py
@@ -4408,9 +4408,6 @@ def __init__(self, *args, **kwargs):
]

def set_vocab(self):
with open(self.dir_model / "chat_template.jinja") as f:
# quick hack to make sure chat template is added
self.gguf_writer.add_chat_template(f.read())
super().set_vocab()

def set_gguf_parameters(self):
@@ -4781,6 +4778,14 @@ def set_gguf_parameters(self):
class MambaModel(TextModel):
model_arch = gguf.MODEL_ARCH.MAMBA

def __init__(self, dir_model: Path, *args, **kwargs):
# Avoid using AutoConfig for hparams
hparams = kwargs.pop("hparams", None)
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
super().__init__(dir_model, *args, hparams=hparams, **kwargs)

def set_vocab(self):
vocab_size = self.hparams["vocab_size"]
# Round vocab size to next multiple of 8
@@ -4855,6 +4860,100 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return [(new_name, data_torch)]


@ModelBase.register("Mamba2ForCausalLM")
class Mamba2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MAMBA2

def __init__(self, dir_model: Path, *args, **kwargs):
# Avoid using AutoConfig for hparams
# It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
hparams = kwargs.pop("hparams", None)
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
super().__init__(dir_model, *args, hparams=hparams, **kwargs)

def set_vocab(self):
vocab_size = self.hparams["vocab_size"]
# Round vocab size to next multiple of 16
pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
# pad using ceiling division
# ref: https://stackoverflow.com/a/17511341/22827863
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
self.hparams["vocab_size"] = vocab_size

if (self.dir_model / "tokenizer.model").is_file():
self._set_vocab_sentencepiece()
elif (self.dir_model / "tokenizer.model.v3").is_file():
# mamba-codestral
raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
elif (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
else:
# Use the GPT-NeoX tokenizer when no tokenizer files are present
self._set_vocab_builtin("gpt-neox", vocab_size)

def set_gguf_parameters(self):
d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
head_dim = self.find_hparam(["head_dim"], optional=True) or 64
n_group = self.find_hparam(["n_groups"], optional=True) or 1

rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

# Fail early for models which don't have a block expansion factor of 2
# TODO: does this really matter?
assert d_inner == 2 * d_model
assert d_inner % head_dim == 0

self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(d_inner // head_dim)
self.gguf_writer.add_ssm_group_count(n_group)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_file_type(self.ftype)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

if name.startswith("model.backbone") or name.startswith("model.lm_head"):
# map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
name = name.removeprefix("model.")

if name.endswith(".dt_bias"):
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"

new_name = self.map_tensor_name(name)

if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
data_torch = data_torch.squeeze()
elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
gguf.MODEL_TENSOR.SSM_A,
gguf.MODEL_TENSOR.SSM_D,
]):
# unsqueeze A to use similar shape semantics as Mamba-1
# (D is also unsqueezed, but for more straightforward broadcast internally)
data_torch = data_torch.reshape((*data_torch.shape, 1))
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
n_group = self.hparams.get("n_groups", 1)
data_torch = data_torch.reshape((n_group, d_inner // n_group))

if name.endswith(".A_log"):
logger.debug("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch)

yield (new_name, data_torch)


@ModelBase.register("CohereForCausalLM")
class CommandR2Model(TextModel):
model_arch = gguf.MODEL_ARCH.COMMAND_R
@@ -6615,12 +6714,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# maybe we should fallback to text model's arch in that case, since not many models have both
text_config = hparams.get("text_config", {})
vision_config = hparams.get("vision_config", {})
arch = hparams["architectures"][0]
arch = None
if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
arch = arches[0]
elif "ssm_cfg" in hparams:
# For non-hf Mamba and Mamba2 models
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"

# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
arch = vision_config["architectures"][0]
if arch is None:
raise ValueError("Failed to detect model architecture")
return arch


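Note on the vocab-size rounding above: `Mamba2Model.set_vocab` pads the vocabulary to the next multiple of `pad_vocab_size_multiple` with the Python ceiling-division idiom `-(vocab_size // -pad_vocab) * pad_vocab`. A minimal C++ sketch of the same arithmetic follows; it is for illustration only and is not part of the diff (the helper name is ours):

    #include <cstdint>
    #include <cstdio>

    // Round `vocab_size` up to the next multiple of `pad`, mirroring the
    // Python expression -(vocab_size // -pad_vocab) * pad_vocab used above.
    static int64_t pad_vocab_size(int64_t vocab_size, int64_t pad) {
        return ((vocab_size + pad - 1) / pad) * pad;
    }

    int main() {
        std::printf("%lld\n", (long long) pad_vocab_size(32000, 16)); // 32000 (already a multiple)
        std::printf("%lld\n", (long long) pad_vocab_size(32005, 16)); // 32016 (rounded up)
        return 0;
    }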
5 changes: 5 additions & 0 deletions examples/eval-callback/eval-callback.cpp
@@ -136,6 +136,11 @@ static bool run(llama_context * ctx, const common_params & params) {

std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

if (tokens.empty()) {
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
return false;
}

if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
7 changes: 4 additions & 3 deletions examples/simple-chat/simple-chat.cpp
@@ -113,15 +113,16 @@ int main(int argc, char ** argv) {
while (true) {
// check if we have enough space in the context to evaluate this batch
int n_ctx = llama_n_ctx(ctx);
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
exit(0);
}

if (llama_decode(ctx, batch)) {
GGML_ABORT("failed to decode\n");
int ret = llama_decode(ctx, batch);
if (ret != 0) {
GGML_ABORT("failed to decode, ret = %d\n", ret);
}

// sample the next token
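The `+ 1` in the simple-chat change above accounts for positions being 0-based: `llama_memory_seq_pos_max()` returns the highest occupied position in the sequence, so the number of context cells already used is that value plus one. A minimal sketch of the corrected check, assuming only the API calls visible in the diff (the helper name is ours):

    #include "llama.h"

    // Returns true if `batch` still fits into the context of `ctx`.
    // Positions are 0-based: 10 decoded tokens occupy positions 0..9,
    // so pos_max = 9 and the number of cells used is pos_max + 1 = 10.
    static bool batch_fits(llama_context * ctx, const llama_batch & batch) {
        const int n_ctx      = llama_n_ctx(ctx);
        const int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
        return n_ctx_used + batch.n_tokens <= n_ctx;
    }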
9 changes: 7 additions & 2 deletions ggml/CMakeLists.txt
@@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
include/ggml-cann.h
include/ggml-cpp.h
include/ggml-cuda.h
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
@@ -360,6 +358,13 @@ write_basic_package_version_file(
VERSION ${GGML_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)

target_compile_definitions(ggml-base PRIVATE
GGML_VERSION="${GGML_INSTALL_VERSION}"
GGML_COMMIT="${GGML_BUILD_COMMIT}"
)
message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
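The `GGML_VERSION` / `GGML_COMMIT` compile definitions added above feed the new library-version reporting (see the "ggml : add version function to get lib version (ggml/1286)" commit in the list). Below is a self-contained sketch of how such build-time macros can be surfaced at runtime; the accessor names are assumptions for illustration, not the actual ggml API:

    #include <cstdio>

    // In the real build these macros come from the target_compile_definitions(ggml-base ...)
    // call shown in the diff above; the fallbacks keep this sketch compilable on its own.
    #ifndef GGML_VERSION
    #define GGML_VERSION "0.0.0-dev"
    #endif
    #ifndef GGML_COMMIT
    #define GGML_COMMIT  "unknown"
    #endif

    // Hypothetical accessors exposing the build-time version/commit to callers.
    static const char * example_ggml_version(void) { return GGML_VERSION; }
    static const char * example_ggml_commit (void) { return GGML_COMMIT;  }

    int main() {
        std::printf("ggml version: %s (commit %s)\n",
                    example_ggml_version(), example_ggml_commit());
        return 0;
    }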
50 changes: 0 additions & 50 deletions ggml/include/ggml-kompute.h

This file was deleted.
