Commit 499f60e

Merge branch 'ggml-org:master' into master
2 parents 29ac478 + 586d5fe commit 499f60e


65 files changed: +4186 -2173 lines

.github/workflows/build.yml

Lines changed: 4 additions & 0 deletions
@@ -374,6 +374,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

       - name: ccache
         uses: hendrikmuhs/[email protected]
@@ -1373,8 +1375,10 @@ jobs:

     needs:
       - ubuntu-cpu-cmake
+      - ubuntu-22-cmake-vulkan
       - windows-latest-cmake
       - windows-2019-cmake-cuda
+      - windows-latest-cmake-sycl
       - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ examples/server/*.css.hpp
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
+examples/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts

CONTRIBUTING.md

Lines changed: 2 additions & 0 deletions
@@ -1,10 +1,12 @@
 # Pull requests (for contributors)

+- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
   - Execute [the full CI locally on your machine](ci/README.md) before publishing
   - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
   - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
   - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
+- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments


Makefile

Lines changed: 2 additions & 2 deletions
@@ -847,7 +847,7 @@ ifdef GGML_MUSA
     CXX := $(MUSA_PATH)/bin/clang++
     MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc

-    MUSAFLAGS = -x musa -mtgpu
+    MUSAFLAGS = -fsigned-char -x musa -mtgpu
     MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))

 ifdef GGML_CUDA_FORCE_MMQ
@@ -1364,7 +1364,7 @@ llama-server: \
     examples/server/index.html.hpp \
     examples/server/loading.html.hpp \
     common/chat.cpp \
-    common/chat.hpp \
+    common/chat.h \
     common/chat-template.hpp \
     common/json.hpp \
     common/minja.hpp \
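
The `-fsigned-char` added to MUSAFLAGS pins down the signedness of plain `char`, which is implementation-defined in C and C++ and defaults to unsigned on some clang targets. A standalone sketch of the behavior the flag controls (illustration only, not code from this commit; assumes a two's-complement target):

// char-sign.cpp: whether plain `char` can hold negative values is
// implementation-defined; -fsigned-char forces the signed behavior.
#include <cstdio>
#include <limits>

int main() {
    char c = '\xFF';  // bit pattern 0xFF
    std::printf("plain char is %s; c = %d\n",
                std::numeric_limits<char>::is_signed ? "signed" : "unsigned",
                static_cast<int>(c));  // -1 if signed, 255 if unsigned
    return 0;
}

Built with `-fsigned-char` this prints -1; without the flag, the result depends on the target ABI.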

common/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
@@ -57,8 +57,7 @@ add_library(${TARGET} STATIC
     arg.h
     base64.hpp
     chat.cpp
-    chat.hpp
-    chat-template.hpp
+    chat.h
     common.cpp
     common.h
     console.cpp
@@ -68,7 +67,8 @@ add_library(${TARGET} STATIC
     llguidance.cpp
     log.cpp
     log.h
-    minja.hpp
+    minja/chat-template.hpp
+    minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
     sampling.cpp

common/arg.cpp

Lines changed: 49 additions & 0 deletions
@@ -2,6 +2,7 @@

 #include "log.h"
 #include "sampling.h"
+#include "chat.h"

 #include <algorithm>
 #include <climits>
@@ -2501,5 +2502,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));

+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
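
The three presets above differ only in the model they point at; each registers a flag name, a help string, and a handler lambda that overwrites `common_params` fields when the flag is seen. A simplified sketch of that dispatch pattern (hypothetical stand-in types, not the actual common_arg implementation):

// preset-sketch.cpp: simplified model of the preset flags registered above.
// `params_subset` is a hypothetical stand-in for llama.cpp's common_params.
#include <cassert>
#include <functional>
#include <map>
#include <string>

struct params_subset {
    std::string hf_repo, hf_file;
    int  port = 8080, n_gpu_layers = 0;
    int  n_ubatch = 512, n_batch = 2048, n_ctx = 4096, n_cache_reuse = 0;
    bool flash_attn = false;
};

int main() {
    // Each preset flag maps to a handler that fills in known-good defaults,
    // mirroring the lambdas passed to add_opt() in the diff above.
    std::map<std::string, std::function<void(params_subset &)>> presets;
    presets["--fim-qwen-1.5b-default"] = [](params_subset & p) {
        p.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
        p.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
        p.port = 8012; p.n_gpu_layers = 99; p.flash_attn = true;
        p.n_ubatch = 1024; p.n_batch = 1024;
        p.n_ctx = 0;           // 0 = take the context size from the model
        p.n_cache_reuse = 256;
    };

    params_subset params;
    presets.at("--fim-qwen-1.5b-default")(params);  // what the parser would do
    assert(params.port == 8012 && params.flash_attn);
    return 0;
}

With these options registered, starting a fill-in-the-middle completion server should be as simple as `llama-server --fim-qwen-7b-default`, which per the handler listens on port 8012 and, as the help string notes, may download the Q8_0 GGUF weights from the ggml-org Hugging Face repo.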
