Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
4427 commits
Select commit Hold shift + click to select a range
02cdd2d
sycl: simplify bin_bcast_kernel (#13383)
AD2605 May 15, 2025
c531edf
convert : fix conversion for llama 4 (#13567)
ngxson May 15, 2025
07ad2b6
gguf-py : fix disconnect-before-connect in editor-gui (#13569)
danielzgtg May 15, 2025
c6a2c9e
gguf : use ggml log system (#13571)
slaren May 15, 2025
bc098c3
minja: sync (qwen3) (#13573)
ochafik May 15, 2025
0a338ed
sycl : fixed compilation warnings (#13582)
lslusarczyk May 16, 2025
7c07ac2
ci : add ppc64el to build-linux-cross (#13575)
CISC May 16, 2025
5364ae4
llama : print hint when loading a model when no backends are loaded (…
slaren May 16, 2025
654a677
metal : add FA-vec kernel for head size 64 (#13583)
ggerganov May 16, 2025
415e40a
releases : use arm version of curl for arm releases (#13592)
slaren May 16, 2025
06c1e4a
readme : add list of dependencies and their license (#13591)
ngxson May 16, 2025
aea9f8b
webui : improve accessibility for visually impaired people (#13551)
ngxson May 16, 2025
6aa892e
server : do not return error out of context (with ctx shift disabled)…
ngxson May 16, 2025
3e0be1c
llguidance : official v0.7.20 release (no actual changes) [noci] (#13…
CoffeeVampir3 May 16, 2025
4f41ee1
vulkan: use scalar FA rather than coopmat2 when N==1 (#13554)
jeffbolznv May 17, 2025
2f5a4e1
vulkan: move common FA code to flash_attn_base.comp (#13556)
jeffbolznv May 17, 2025
518329b
parallel : add option for non-shared and larger prompts (#13598)
ggerganov May 17, 2025
e3a7cf6
cmake: use the current build config for vulkan-shaders-gen (#13595)
giladgd May 17, 2025
6a2bc8b
server : added --no-prefill-assistant flag (#13608)
isaac-mcfadyen May 17, 2025
33d7aed
CANN: Support MOE Model MUL_MAT_ID (#13042)
noemotiovon May 19, 2025
9c55e5c
fix: check model pointer validity before use (#13631)
D2hugging May 19, 2025
60aea02
ggml : Fix missing backtrace on Linux (ggml/1228)
danielzgtg May 17, 2025
8b5e19a
ggml : fix apple OS check in ggml_print_backtrace (ggml/1229)
slaren May 19, 2025
6c35981
mnist: fix segmentation fault (ggml/1227)
JohannesGaessler May 19, 2025
d30cb5a
sync : ggml
ggerganov May 19, 2025
f71f40a
ci : upgraded oneAPI version in SYCL workflows and dockerfile (#13532)
Alcpz May 19, 2025
92ecdcc
mtmd : add vision support for llama 4 (#13282)
ngxson May 19, 2025
725f23f
sycl : backend documentation review (#13544)
Alcpz May 19, 2025
8960efd
Vulkan: Add f32 accumulator support to quantized mul mat to fix GLM4 …
0cc4m May 19, 2025
1dfbf2c
common : add load_progress_callback (#13617)
psocolovsky May 19, 2025
f7c9429
sycl : Overcoming workaround for mmap() allocation on Windows (#13482)
s-Nick May 20, 2025
f0adb80
CANN: Update CANN model support (#13162)
bachelor-dou May 20, 2025
e298d2f
kv-cache : add SWA support (#13194)
ggerganov May 20, 2025
c00a263
metal : fix typo in FA kernel comments (#13651)
ggerganov May 20, 2025
c9c64de
Set GLM4 blk.*.attn_output.weight, kqv_out-* matmul to GGML_PREC_F32 …
0cc4m May 20, 2025
4245e62
sycl: disable reorder for sycl mulmat (#13536)
sgeor255 May 20, 2025
759e37b
tests : avoid github urls due to throttling (#13654)
CISC May 20, 2025
b69f164
CUDA: skip fully masked-out KV in FA vec kernel (#13584)
JohannesGaessler May 20, 2025
a4090d1
llama : remove llama_kv_cache_view API + remove deprecated (#13653)
ggerganov May 20, 2025
be02396
model : fix llama4 graph (#13663)
ggerganov May 20, 2025
b7a1746
mtmd-helper : bug fix to token batching in mtmd (#13650)
l3utterfly May 20, 2025
fb1cab2
vulkan: fix warnings (#13626)
netrunnereve May 20, 2025
3398305
musa: Upgrade MUSA SDK version to rc4.0.1 and use mudnn::Unary::IDENT…
yeahdongcn May 21, 2025
b44890d
model : disable SWA for Phi models (#13676)
ggerganov May 21, 2025
797f2ac
kv-cache : simplify the interface (#13660)
ggerganov May 21, 2025
42158ae
server : fix first message identification (#13634)
doringeman May 21, 2025
0d5c742
server : Add the endpoints /api/tags and /api/chat (#13659)
R-Dson May 21, 2025
cf4cb59
ggml : add ggml_gelu_erf() (#13667)
ngxson May 21, 2025
eb0f5c2
gguf-py : display the invalid gguf type (#13687)
emmanuel-ferdman May 21, 2025
2aa777d
examples : switch retrieval to llama_encode (#13685)
CISC May 21, 2025
c76532e
convert : add qwen2vl support for unsloth merges (#13686)
antichristHater May 21, 2025
5fbfe38
server : improve error reporting (#13680)
ggerganov May 21, 2025
8e186ef
hparams : support models for which all layers use SWA (#13682)
ggerganov May 21, 2025
d643bb2
releases : build CPU backend separately (windows) (#13642)
slaren May 21, 2025
edbf42e
opencl: fix couple crashes (#12795)
May 21, 2025
a4e8912
opencl: Add support for multiple devices (#12622)
May 21, 2025
6b56a64
SYCL: Avoid using with SYCL-Graph for unsupported nodes (#13587)
May 22, 2025
d394a9a
sycl : Remove waits from function calls (#13702)
s-Nick May 22, 2025
5be24af
gguf-py : correct charsmap parameter typing (#13701)
CISC May 22, 2025
cc74d5b
server : pad small embedding batches (#13692)
ggerganov May 22, 2025
ab86335
common: Include torch package for s390x (#13699)
taronaeo May 22, 2025
797990c
mtmd : add ultravox audio input (#13623)
ngxson May 22, 2025
8a1d206
tts : fix n_ubatch + make WavTokenizer cache-less (#13713)
ggerganov May 22, 2025
3079e9a
release : fix windows hip release (#13707)
slaren May 22, 2025
a127ff1
use LOG_WARN to replace `std::cerr` (#13657)
foldl May 23, 2025
c10ed6c
vulkan: Disable coopmat/coopmat2/bfloat extensions if glslc doesn't s…
jeffbolznv May 23, 2025
1dcd019
vulkan: support CPY from any type to itself (#13695)
jeffbolznv May 23, 2025
e16c473
ggml : fix the order of ggml_unary_op (#13718)
ngxson May 23, 2025
faaaff5
CANN: Support MUL_MAT_ID for q8_0 and q4_0 (#13705)
noemotiovon May 23, 2025
9ecf3e6
server : support audio input (#13714)
ngxson May 23, 2025
8a2afb7
llama : allow custom list of swa_layers (#13726)
ngxson May 23, 2025
d13d0f6
hparams : initialize arrays (#13728)
ggerganov May 23, 2025
a70a8a6
ci : add winget package updater (#13732)
slaren May 23, 2025
b775345
ci : enable winget package updates (#13734)
slaren May 23, 2025
ffd0eae
CUDA: fix race condition in FA vector kernels (#13742)
JohannesGaessler May 24, 2025
c3a2624
vocab : fix ugm tokenizer precision (#13743)
CISC May 24, 2025
4c32832
ggml : add ggml_gelu_erf() CUDA kernel (#13719)
ngxson May 24, 2025
259469c
Move GLM4 f32 attention fix to the correct function (#13750)
0cc4m May 24, 2025
2bd1b30
ggml-cpu : set openmp wait time if not set (#13758)
slaren May 24, 2025
17fc817
releases : enable openmp in windows cpu backend build (#13756)
slaren May 24, 2025
a2d02d5
releases : bundle llvm omp library in windows release (#13763)
slaren May 24, 2025
f5cd27b
`server`: streaming of tool calls and thoughts when `--jinja` is on (…
ochafik May 25, 2025
515fdbf
SYCL: revert "sycl: simplify bin_bcast_kernel (#13383)" (#13752)
qnixsynapse May 25, 2025
4032ca4
llama : add support for Qwen3 MoE tied word embeddings (#13768)
estibi May 25, 2025
d785f9c
server: fix/test add_generation_prompt (#13770)
ochafik May 25, 2025
a08c1d2
docs : add Moondream2 pre-quantized link (#13745)
ddpasa May 25, 2025
40aaa8a
mtmd : add support for Qwen2-Audio and SeaLLM-Audio (#13760)
ngxson May 25, 2025
c508256
rpc : Fix build on OpenBSD (#13541)
percypiper May 25, 2025
de2ef53
kv-cache : rework kv_cell (#13706)
ggerganov May 25, 2025
aa50ba4
tests : improve UGM tokenizer test coverage (#13773)
CISC May 25, 2025
2f099b5
webui : bump max upload file size to 500MB (#13779)
ngxson May 25, 2025
e121edc
`server`: add `--reasoning-budget 0` to disable thinking (incl. qwen3…
ochafik May 25, 2025
2d38b6e
CANN: Add the basic supports of Flash Attention kernel (#13627)
shibizhao May 26, 2025
fef693d
vulkan: mark IM2COL as supporting non-contig (#13783)
jeffbolznv May 26, 2025
9012eb9
sycl: Add more debug prints (#13640)
Rbiessy May 26, 2025
2222931
llama : clarify deprecation message (#13794)
ggerganov May 26, 2025
79c137f
examples : allow extracting embeddings from decoder contexts (#13797)
ggerganov May 26, 2025
f13847c
server: fix regression on streamed non-chat completion w/ stops (#13785)
ochafik May 26, 2025
d74e94c
`server`: fix format of streamed tool call deltas (diff name, fix id …
ochafik May 26, 2025
88c125f
examples/training: Fix file name in README (#13803)
standby24x7 May 26, 2025
03f582a
server: fix streaming crashes (#13786)
ochafik May 26, 2025
6f180b9
SYCL: Add non contiguous support in RMS_NORM and NORM kernels (#13611)
qnixsynapse May 26, 2025
4265a87
cuda : avoid cuGetErrorString (#13791)
ggerganov May 26, 2025
a26c4cc
scripts : add option to compare commits in Debug (#13806)
ggerganov May 26, 2025
cdf94a1
server: --offline mode (#13804)
ochafik May 26, 2025
4f81b33
llama : validate seq id batch input (#13809)
ggerganov May 27, 2025
f9cd683
sampling : make sure samplers return at least 1 token (#13822)
ggerganov May 27, 2025
8171312
kv-cells : track min/max used cells and per-sequence positions (#13808)
ggerganov May 27, 2025
952f395
ggml : allow CUDA graphs when using pipeline parallelism (#13814)
slaren May 27, 2025
7fe03e7
ggml-cpu: x86 feature detection is specific to x86 (#13811)
ckastner May 27, 2025
72b090d
docs: remove link for llama-cli function calling (#13810)
bandoti May 27, 2025
bc583e3
mtmd : support Qwen 2.5 Omni (input audio+vision, no audio output) (#…
ngxson May 27, 2025
05f6ac6
ggml : riscv: add xtheadvector support (#13720)
xctan May 27, 2025
a8ea03d
ggml : add ggml_repeat_4d (#13824)
ngxson May 27, 2025
1c49c70
sync : ggml
ggerganov May 27, 2025
f3101a8
SYCL: add gelu_erf kernel (#13749)
qnixsynapse May 27, 2025
34b7c04
cmake : add llama-cparams.cpp to build (#13832)
ggerganov May 27, 2025
bef8176
vulkan: use timestamp queries for GGML_VULKAN_PERF (#13817)
jeffbolznv May 27, 2025
1701d4c
opencl: mark `mul_mat` `f32f32` as supporting non-contiguous tensors …
lhez May 27, 2025
a3c3084
opencl: add new ops - `argsort`, `div`, `sub`, `addrows`, `sigmoid`, …
lhez May 27, 2025
1e8659e
CANN: Add SOC TYPE printing in cmake configuration (#13837)
leo-pony May 28, 2025
26b79b6
convert : fix tensor naming conflict for llama 4 vision (#13836)
ngxson May 28, 2025
a682474
CUDA: fix FA tg at long context for CC >= 8.9 (#13852)
JohannesGaessler May 28, 2025
f7873fc
tests : change umlaut test (#11600)
n00b001 May 28, 2025
a3938fb
convert : fix qwen omni conversion (#13859)
ngxson May 28, 2025
c962ae3
server: fix remove 'image_url'/'input_audio' json-object effectlly fo…
flyinskyin2013 May 28, 2025
aa6dff0
convert: small addition to support LlamaModel (#13838)
huydt84 May 28, 2025
e0e3aa2
llama : add support for BertForSequenceClassification reranker (#13858)
huydt84 May 28, 2025
d98f2a3
ci: disable LLAMA_CURL for Linux cross-builds (#13871)
bandoti May 28, 2025
1096133
mtmd : move helpers to dedicated library (⚠️ breaking change) (#13866)
ngxson May 28, 2025
763d06e
llama : fix KV shift for qwen2vl (#13870)
ngxson May 28, 2025
53ae306
gguf-py : fix SafetensorRemote return on undefined size (< 0) (#13841)
Beinsezii May 28, 2025
1b8fb81
ggml: aarch64: Implement SVE F32 kernels for vector functions (#13843)
vineelabhinav May 29, 2025
6385b84
llama : add RobertaForSequenceClassification reranker support (#13875)
CISC May 29, 2025
5ca82fc
convert : workaround for AutoConfig dummy labels (#13881)
CISC May 29, 2025
66c9206
tests : remove json.hpp from a test (#13880)
ggerganov May 29, 2025
dd8ba93
ggml: aarch64: Implement SVE F32 kernels for Mamba Sequential Scan Al…
vineelabhinav May 29, 2025
21fcc21
cmake: Factor out CPU architecture detection (#13883)
ckastner May 29, 2025
54a2c7a
arm64: optimize q4_k_q8_k kernel with i8mm (#13886)
cyb70289 May 29, 2025
2b13162
gguf-py : add support for sub_type (in arrays) in GGUFWriter add_key_…
CISC May 29, 2025
e83ba3e
llama : add support for jina-reranker-v2 (#13900)
CISC May 29, 2025
ec9e030
cmake: Guard GGML_CPU_ALL_VARIANTS by architecture (#13890)
ckastner May 29, 2025
2c90da4
llama : use llm_build_granite for minicpm (#13911)
zkh2016 May 30, 2025
291f2b6
llama : add support for DistilBert (#13907)
huydt84 May 30, 2025
07e4351
convert : allow partial update to the chkhsh pre-tokenizer list (#13847)
ngxson May 30, 2025
db38704
convert : fix rwkv bos/eos token (#13844)
CISC May 30, 2025
53f9250
sync : vendor (#13901)
ggerganov May 30, 2025
b49a8ff
SYCL: Add mrope kernel (#13755)
qnixsynapse May 30, 2025
df0c0c7
cuda : prevent using split buffers with 3d/4d matrices (#13919)
slaren May 30, 2025
dd665cc
parallel : increase the variability of the prompt lengths (#13927)
ggerganov May 30, 2025
b47ab7b
sched : avoid changing cur_copy when a graph is already allocated (#1…
slaren May 30, 2025
e562eec
CUDA: fix typo in FlashAttention code (#13926)
JohannesGaessler May 30, 2025
eb39499
CUDA: add a prop in ggml_cuda_device_infor for distinguish iGPU or dG…
Yangxiaoz May 31, 2025
12d0188
kv-cache : refactor + add llama_memory_state_i (#13746)
ggerganov May 31, 2025
51fa76f
mtmd : drop `_shared` from `libmtmd` name, merge helpers into libmtmd…
ngxson May 31, 2025
3f55f78
llama : auto-batch preparation (#13845)
ggerganov May 31, 2025
c7e0a20
webui : Replace alert and confirm with custom modals. (#13711)
igardev May 31, 2025
3600cc2
llama : use n_swa + n_ubatch cells for SWA cache (#13833)
ggerganov May 31, 2025
803f8ba
llama : deprecate explicit kv_self defrag/update calls (#13921)
ggerganov May 31, 2025
e15898d
server: allow unclosed thinking tags (#13931)
ochafik May 31, 2025
b3a89c3
docs : Note about necessity of having libcurl installed for standard …
jpodivin May 31, 2025
053b153
threading: support for GGML_SCHED_PRIO_LOW, update thread info on Win…
max-krasnyansky May 31, 2025
0fc16b4
kv-cache : split implementation in separate sources (#13920)
ggerganov Jun 1, 2025
c046217
parallel : fix n_junk == 0 (#13952)
ggerganov Jun 1, 2025
8726392
readme : update bindings (#13950)
ddh0 Jun 1, 2025
fedf034
ggml : Print backtrace on uncaught C++ exceptions (ggml/1232)
danielzgtg May 28, 2025
6eba72b
ggml : install dynamic backends (ggml/1240)
rgerganov May 29, 2025
a7b8d35
sync : whisper.cpp (ggml/1250)
ggerganov May 29, 2025
af6f91d
ggml : remove ggml_graph_import and ggml_graph_export declarations (g…
rgerganov May 30, 2025
d337252
cmake : Fix broken CMake error messages (ggml/1252)
dg0yt May 31, 2025
108009f
vulkan : Remove unexpected ; (ggml/1253)
dg0yt May 31, 2025
f3a4b16
sync : ggml
ggerganov Jun 1, 2025
e57bb87
ggml: check if non-native endian model is being loaded (#13943)
taronaeo Jun 1, 2025
c496fe0
convert : fix vocab padding code for bert models (#13954)
CISC Jun 1, 2025
5e1c3ae
convert : fix nomic-bert-moe mask token (#13757)
CISC Jun 1, 2025
7675c55
gguf: fix failure on version == 0 (#13956)
JohannesGaessler Jun 1, 2025
663445b
sycl: quantize and reorder the input to q8_1 when reorder is enabled …
AD2605 Jun 2, 2025
093e3f1
cmake : Handle mixed-case 'Power' strings in POWER CPU detection (#13…
shalinib-ibm Jun 2, 2025
bfd3227
mtmd : fix memory leak in mtmd_helper_eval_chunk_single (#13961)
ngxson Jun 2, 2025
c9bbc77
`server`: update deepseek reasoning format (pass reasoning_content as…
ochafik Jun 2, 2025
5582c49
gemma : more consistent attention scaling for v2 and v3 (#13951)
ggerganov Jun 2, 2025
ea394d7
metal : use F32 accumulators in FA kernels (#13975)
ggerganov Jun 2, 2025
3637576
server : disable speculative decoding for SWA models (#13970)
ggerganov Jun 2, 2025
bfb1e01
OpenCL: Add concat, tsembd, upscale, tanh, pad and repeat (#13840)
rmatif Jun 2, 2025
71e74a3
opencl: add `backend_synchronize` (#13939)
lhez Jun 2, 2025
ea1431b
docs : add "Quick start" section for new users (#13862)
ngxson Jun 3, 2025
7e00e60
vulkan: fix warnings in perf logger querypool code (#13937)
jeffbolznv Jun 3, 2025
e0e806f
kv-cache : fix unified::seq_rm to work with seq_id < 0 (#13985)
ggerganov Jun 4, 2025
0b4be4c
CUDA: fix FTZ in FA for Gemma 3 (#13991)
JohannesGaessler Jun 4, 2025
3ac6753
llama-graph : use ggml_repeat_4d (#13998)
ngxson Jun 4, 2025
4825487
releases : use dl backend for linux release, remove arm64 linux relea…
slaren Jun 4, 2025
2589ad3
ci : remove cuda 11.7 releases, switch runner to windows 2022 (#13997)
slaren Jun 4, 2025
3e63a58
kv-cache : refactor the update/defrag mechanism (#13988)
ggerganov Jun 4, 2025
0d39844
ggml-vulkan: adds support for op CONV_TRANSPOSE_1D (#13813)
etasnadi Jun 4, 2025
5a8ae30
vulkan: automatically deduce size of push constants (#13936)
jeffbolznv Jun 5, 2025
9e31bec
context : fix pos_min initialization upon error decode (#14008)
ggerganov Jun 5, 2025
9f47fa5
vocab : warn about missing mask token (#14022)
CISC Jun 5, 2025
d01d112
readme : add badge (#13938)
Olexandr88 Jun 5, 2025
3a07714
llama : allow using mmap without PrefetchVirtualMemory, apply GGML_WI…
slaren Jun 5, 2025
7f37b6c
memory : migrate from llama_kv_cache to more generic llama_memory (#1…
ggerganov Jun 5, 2025
146b88e
ci: fix CUDA build failure on autodl cloud machines (#14005)
pockers21 Jun 5, 2025
669c13e
vulkan: Enable VK_KHR_cooperative_matrix extension for Intel Xe2 GPUs…
rillomas Jun 5, 2025
1caae7f
gguf-py : add add_classifier_output_labels method to writer (#14031)
CISC Jun 5, 2025
d17a809
llama : support multiple classifier outputs and labels (#13940)
CISC Jun 6, 2025
487a5e0
context : fix SWA-related warning for multiple sequences (#14045)
ggerganov Jun 6, 2025
745aa53
llama : deprecate llama_kv_self_ API (#14030)
ggerganov Jun 6, 2025
0974ad7
llama : fix llama_model_chat_template with template name (LLM_KV with…
CISC Jun 7, 2025
228f34c
SYCL: Implement few same quantized type copy kernels (#13739)
qnixsynapse Jun 7, 2025
5787b5d
ci: add LoongArch cross-compile build (#13944)
wojiushixiaobai Jun 7, 2025
247e5c6
cuda : fix buffer type check with integrated GPUs (#14069)
slaren Jun 8, 2025
056eb74
CANN: Enable labeler for Ascend NPU (#13914)
shink Jun 9, 2025
91a8ee6
add geglu activation function (#14074)
huydt84 Jun 9, 2025
b460d16
sycl: Add reorder to Q6_K mmvq implementation (#13885)
s-Nick Jun 9, 2025
87d34b3
server : fix LRU check (#14079)
ggerganov Jun 9, 2025
dc0623f
webui: fix sidebar being covered by main content (#14082)
yeahdongcn Jun 9, 2025
e21d2d4
CANN: Simplify the environment variable setting(#13104)
bachelor-dou Jun 9, 2025
201b31d
graph : fix geglu (#14077)
ggerganov Jun 9, 2025
8f47e25
cuda : fix device sync on buffer clear (#14033)
slaren Jun 9, 2025
f470bc3
ggml-cpu : split arch-specific implementations (#13892)
xctan Jun 9, 2025
7f4fbe5
llama : allow building all tests on windows when not using shared lib…
slaren Jun 9, 2025
40cbf57
kv-cache : fix shift and defrag logic (#14081)
ggerganov Jun 9, 2025
1f63e75
metal : use less stack memory in FA kernel (#14088)
ggerganov Jun 9, 2025
1a3b5e8
Add in-build ggml::ggml ALIAS library (ggml/1260)
dg0yt Jun 3, 2025
b8e2194
sync : ggml
ggerganov Jun 10, 2025
2bb0467
rpc : nicer error messages for RPC server crash (#14076)
isaac-mcfadyen Jun 10, 2025
cb3bf57
vulkan : fix build failure caused by vulkan-shaders-gen install
AsbjornOlling Jun 5, 2025
97340b4
Vulkan: Don't default to CPU device (like llvmpipe), even if no other…
0cc4m Jun 10, 2025
b7ce1ad
ggml : fix weak alias win32 (whisper/0)
ggerganov Jun 10, 2025
ae92c18
sync : ggml
ggerganov Jun 10, 2025
3a12db2
Fixed spec timings to: accepted/tested instead of accepted/drafted (#…
jukofyork Jun 10, 2025
652b70e
vulkan: force device 0 in CI (#14106)
jeffbolznv Jun 10, 2025
3678b83
llama : support GEGLU for jina-bert-v2 (#14090)
CISC Jun 10, 2025
55f6b9f
convert : fix duplicate key DeepSeek-R1 conversion error (#14103)
CISC Jun 10, 2025
dad5c44
kv-cache : avoid modifying recurrent cells when setting inputs (#13834)
compilade Jun 10, 2025
4c763c8
opencl: add `mul_mv_id_q4_0_f32_8x_flat` (#14003)
lhez Jun 10, 2025
1f7d50b
vulkan: Track descriptor pools/sets per-context (#14109)
jeffbolznv Jun 11, 2025
7ae2932
kv-cache : add LLAMA_KV_CACHE_DEBUG environment variable (#14121)
ggerganov Jun 11, 2025
2baf077
server : pass default --keep argument (#14120)
MightyAlex200 Jun 11, 2025
89a184f
kv-cache : relax SWA masking condition (#14119)
ggerganov Jun 11, 2025
7781e5f
webui: Wrap long numbers instead of infinite horizontal scroll (#14062)
am17an Jun 11, 2025
bd248d4
vulkan: Better thread-safety for command pools/buffers (#14116)
jeffbolznv Jun 11, 2025
cc66a7f
tests : add test-tokenizers-repo (#14017)
CISC Jun 11, 2025
d4e0d95
chore : clean up relative source dir paths (#14128)
CISC Jun 11, 2025
532802f
Implement GGML_CPU_ALL_VARIANTS for ARM (#14080)
ckastner Jun 11, 2025
2e89f76
common: fix issue with regex_escape routine on windows (#14133)
bandoti Jun 11, 2025
a20b2b0
context : round n_tokens to next multiple of n_seqs when reserving (#…
compilade Jun 12, 2025
9596506
kv-cache : fix split_equal handling in unified implementation (#14130)
ggerganov Jun 12, 2025
e2c0b6e
cmake : handle whitepsaces in path during metal build (#14126)
ggerganov Jun 12, 2025
82bea12
Merge branch 'master' into fix-vulkan-build-failure-again
AsbjornOlling Jun 12, 2025
c2e9de3
vulkan : try to fix windows build by branching on cmake generators
AsbjornOlling Jun 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
161 changes: 161 additions & 0 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
---
# clang-format configuration for the project's C/C++ sources.
# Keys that are commented out require a newer clang-format release than the
# project's supported minimum; enable them once the toolchain baseline moves.
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
Kind: Always
OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
AfterCaseLabel: true
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '.*'
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...

4 changes: 4 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ Checks: >
-readability-implicit-bool-conversion,
-readability-magic-numbers,
-readability-uppercase-literal-suffix,
-readability-simplify-boolean-expr,
-readability-math-missing-parentheses,
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
portability-*,
-portability-simd-intrinsics,
misc-*,
-misc-const-correctness,
-misc-non-private-member-variables-in-classes,
-misc-no-recursion,
-misc-use-anonymous-namespace,
FormatStyle: none
22 changes: 22 additions & 0 deletions .devops/cloud-v-pipeline
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Jenkins CI pipeline: cross-compiles llama.cpp for RISC-V with the vector
// (RVV) toolchain and smoke-tests the resulting binary under qemu-riscv64.
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
stage('Cleanup'){
cleanWs() // Cleaning previous CI build in workspace
}
stage('checkout repo'){
retry(5){ // Retry if the cloning fails due to some reason
checkout scm // Clone the repo on Runner
}
}
// NOTE(review): build and run share one workspace; the model path below is
// hardcoded to the runner's filesystem — assumed to be pre-provisioned there.
stage('Compiling llama.cpp'){
sh'''#!/bin/bash
make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
'''
}
stage('Running llama.cpp'){
sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results
'''
}
}
92 changes: 92 additions & 0 deletions .devops/cpu.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

# Supplied automatically by buildx/BuildKit: the platform being built for.
ARG TARGETARCH

# Baseline ARM ISA to compile for when targeting arm64.
ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

# amd64 builds the dynamically-loadable multi-variant CPU backend; arm64 is
# compiled for a single fixed architecture. Any other arch fails the build
# explicitly rather than producing a silently-wrong image.
RUN if [ "$TARGETARCH" = "amd64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
    fi && \
    cmake --build build -j $(nproc)

# Collect the backend shared libraries in one place for the runtime stages.
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Stage everything the "full" image ships: binaries, conversion scripts plus
# their python requirements, and the multiplexing tools.sh entrypoint.
RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
# libgomp1 is required by the OpenMP-enabled CPU backend; curl by HEALTHCHECK.
# All apt cleanup happens in the same layer so caches never reach the image.
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

# --no-cache-dir keeps pip's download cache out of the layer (hadolint DL3042).
RUN apt-get update \
    && apt-get install -y \
       git \
       python3 \
       python3-pip \
    && pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

# Bind to all interfaces so the server is reachable from outside the container.
ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
94 changes: 94 additions & 0 deletions .devops/cuda.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.4.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

WORKDIR /app

COPY . .

# Restrict CUDA architectures only when explicitly requested; otherwise let
# CMake build for every arch supported by this CUDA toolkit version.
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

# Collect the backend shared libraries in one place for the runtime stages.
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Stage everything the "full" image ships: binaries, conversion scripts plus
# their python requirements, and the multiplexing tools.sh entrypoint.
RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
# Runtime stages start from the slimmer -runtime CUDA image; libgomp1 is
# required by the CPU backend variants, curl by the server HEALTHCHECK.
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

# --no-cache-dir keeps pip's download cache out of the layer (hadolint DL3042).
RUN apt-get update \
    && apt-get install -y \
       git \
       python3 \
       python3-pip \
    && pip install --no-cache-dir --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

# Bind to all interfaces so the server is reachable from outside the container.
ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
Loading
Loading