Skip to content

Commit 072280e

Browse files
committed
Merge branch 'master' into gg/llama-kv-cache
ggml-ci
2 parents f95b04a + 4806498 commit 072280e

34 files changed

+2088
-1203
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ examples/server/*.css.hpp
9898
examples/server/*.html.hpp
9999
examples/server/*.js.hpp
100100
examples/server/*.mjs.hpp
101+
examples/server/*.gz.hpp
101102
!build_64.sh
102103
!examples/*.bat
103104
!examples/*/*.kts

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Pull requests (for contributors)
22

3+
- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier.
34
- Test your changes:
45
- Execute [the full CI locally on your machine](ci/README.md) before publishing
56
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1364,7 +1364,7 @@ llama-server: \
13641364
examples/server/index.html.hpp \
13651365
examples/server/loading.html.hpp \
13661366
common/chat.cpp \
1367-
common/chat.hpp \
1367+
common/chat.h \
13681368
common/chat-template.hpp \
13691369
common/json.hpp \
13701370
common/minja.hpp \

common/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,7 @@ add_library(${TARGET} STATIC
5757
arg.h
5858
base64.hpp
5959
chat.cpp
60-
chat.hpp
61-
chat-template.hpp
60+
chat.h
6261
common.cpp
6362
common.h
6463
console.cpp
@@ -68,7 +67,8 @@ add_library(${TARGET} STATIC
6867
llguidance.cpp
6968
log.cpp
7069
log.h
71-
minja.hpp
70+
minja/chat-template.hpp
71+
minja/minja.hpp
7272
ngram-cache.cpp
7373
ngram-cache.h
7474
sampling.cpp

common/arg.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "log.h"
44
#include "sampling.h"
5+
#include "chat.h"
56

67
#include <algorithm>
78
#include <climits>
@@ -2501,5 +2502,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25012502
}
25022503
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
25032504

2505+
add_opt(common_arg(
2506+
{"--fim-qwen-1.5b-default"},
2507+
string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
2508+
[](common_params & params) {
2509+
params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
2510+
params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
2511+
params.port = 8012;
2512+
params.n_gpu_layers = 99;
2513+
params.flash_attn = true;
2514+
params.n_ubatch = 1024;
2515+
params.n_batch = 1024;
2516+
params.n_ctx = 0;
2517+
params.n_cache_reuse = 256;
2518+
}
2519+
).set_examples({LLAMA_EXAMPLE_SERVER}));
2520+
2521+
add_opt(common_arg(
2522+
{"--fim-qwen-3b-default"},
2523+
string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
2524+
[](common_params & params) {
2525+
params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
2526+
params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
2527+
params.port = 8012;
2528+
params.n_gpu_layers = 99;
2529+
params.flash_attn = true;
2530+
params.n_ubatch = 1024;
2531+
params.n_batch = 1024;
2532+
params.n_ctx = 0;
2533+
params.n_cache_reuse = 256;
2534+
}
2535+
).set_examples({LLAMA_EXAMPLE_SERVER}));
2536+
2537+
add_opt(common_arg(
2538+
{"--fim-qwen-7b-default"},
2539+
string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
2540+
[](common_params & params) {
2541+
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
2542+
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
2543+
params.port = 8012;
2544+
params.n_gpu_layers = 99;
2545+
params.flash_attn = true;
2546+
params.n_ubatch = 1024;
2547+
params.n_batch = 1024;
2548+
params.n_ctx = 0;
2549+
params.n_cache_reuse = 256;
2550+
}
2551+
).set_examples({LLAMA_EXAMPLE_SERVER}));
2552+
25042553
return ctx_arg;
25052554
}

0 commit comments

Comments
 (0)