Skip to content

Commit 0a53f68

Browse files
committed
common : add configuration presets for common tasks (#10932)
1 parent 8d23bfc commit 0a53f68

File tree

2 files changed

+137
-0
lines changed

2 files changed

+137
-0
lines changed

common/arg.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3355,5 +3355,58 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
33553355
}
33563356
).set_examples({LLAMA_EXAMPLE_SERVER}));
33573357

3358+
// Configuration preset: FIM (fill-in-the-middle) completion server backed by
// Qwen 2.5 Coder 1.5B. Weights are pulled from Hugging Face on first use.
add_opt(common_arg(
    {"--fim-server-qwen-1.5b"},
    string_format("use Qwen 2.5 Coder 1.5B model for a FIM server (note: can download weights from the internet)"),
    [](common_params & params) {
        params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
        params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
        params.port          = 8012;
        params.n_gpu_layers  = 99;
        params.flash_attn    = true;
        params.n_ubatch      = 1024;
        params.n_batch       = 1024;
        params.n_ctx         = 0;   // 0: use the model's full training context
        params.n_cache_reuse = 256;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}));

// Configuration preset: embedding server backed by BGE Small EN v1.5.
add_opt(common_arg(
    {"--embedding-server-bge"},
    string_format("use BGE Small EN model for an embedding server (note: can download weights from the internet)"),
    [](common_params & params) {
        params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
        params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
        params.port          = 8033;
        params.n_gpu_layers  = 99;
        params.flash_attn    = true;
        // raw token embeddings, normalized with the Euclidean norm
        params.pooling_type   = LLAMA_POOLING_TYPE_NONE;
        params.embd_normalize = 2;
        params.n_ctx          = 512;
        params.embedding      = true;
        params.n_batch        = 512;
        params.n_ubatch       = 512;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}));

// Configuration preset: Qwen 2.5 Coder 7B target with a 0.5B draft model for
// speculative decoding.
add_opt(common_arg(
    {"--spec-server-qwen-7b"},
    string_format("use Qwen2.5 Coder 7B with 0.5B draft for speculative decoding (note: can download weights from the internet)"),
    [](common_params & params) {
        params.model.hf_repo             = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
        params.model.hf_file             = "qwen2.5-coder-7b-q8_0.gguf";
        params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
        params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
        params.speculative.n_gpu_layers  = 99;
        params.port          = 8080;
        params.n_gpu_layers  = 99;
        params.flash_attn    = true;
        params.n_ubatch      = 1024;
        params.n_batch       = 1024;
        params.n_ctx         = 0;   // 0: use the model's full training context
        params.n_cache_reuse = 256;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}));
3410+
33583411
return ctx_arg;
33593412
}

verify-presets.sh

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/bin/bash

# Verify that the server configuration presets are defined in common/arg.cpp
# and exposed through the llama-server --help output.
# Run from the repository root after building (expects ./build/bin/llama-server).

set -euo pipefail

# Check whether a preset flag appears in the --help output, and print the
# expected value of one of the parameters it configures.
#   $1 - preset flag name (e.g. "fim-server-qwen-1.5b")
#   $2 - parameter name
#   $3 - expected value
check_param() {
  local preset=$1
  local param=$2
  local expected_value=$3

  echo "Checking $param for preset $preset"
  # -F: fixed-string match — preset names contain '.', which an ERE would
  # treat as "any character" and could match unrelated lines.
  # --: guards against a preset name starting with '-'.
  # Full output is consumed (no -q) so the server never gets SIGPIPE.
  if ./build/bin/llama-server --help | grep -F -- "$preset" > /dev/null; then
    echo " Preset exists: YES"
  else
    echo " Preset exists: NO"
  fi

  # We can't directly check the values without running the server, but we can
  # document what the preset is expected to set.
  echo " Parameter $param should be set to $expected_value"
}
17+
18+
# Print the expected settings for one preset; every argument after the preset
# name is a "param=expected value" pair.
run_checks() {
  local preset=$1
  shift
  local pair
  for pair in "$@"; do
    check_param "$preset" "${pair%%=*}" "${pair#*=}"
  done
}

# NOTE(review): chat-llama3-8b-default and rerank-bge-default are assumed to
# already be defined in common/arg.cpp — confirm they exist there.
echo "Verifying chat-llama3-8b-default preset:"
run_checks "chat-llama3-8b-default" \
  "port=8080" "gpu-layers=99" "flash-attn=true" \
  "ubatch-size=512" "batch-size=512" "ctx-size=4096" "cache-reuse=256"

echo -e "\nVerifying rerank-bge-default preset:"
run_checks "rerank-bge-default" \
  "port=8090" "gpu-layers=99" "flash-attn=true" "ctx-size=512" "reranking=true"

echo -e "\nVerifying fim-server-qwen-1.5b preset:"
run_checks "fim-server-qwen-1.5b" \
  "port=8012" "gpu-layers=99" "flash-attn=true" \
  "ubatch-size=1024" "batch-size=1024" "cache-reuse=256"

echo -e "\nVerifying embedding-server-bge preset:"
run_checks "embedding-server-bge" \
  "port=8033" "gpu-layers=99" "flash-attn=true" \
  "ctx-size=512" "embedding=true" "pooling=none"

echo -e "\nVerifying spec-server-qwen-7b preset:"
run_checks "spec-server-qwen-7b" \
  "port=8080" "gpu-layers=99" "flash-attn=true" \
  "ubatch-size=1024" "batch-size=1024" "cache-reuse=256" \
  "model-draft=set to a draft model"

# Dump the preset definitions from the source so the expected values above can
# be compared by eye. -A <n> covers each preset's lambda body.
echo -e "\nExamining preset code in common/arg.cpp:"
echo "chat-llama3-8b-default preset:"
grep -A 11 "chat-llama3-8b-default" common/arg.cpp

echo -e "\nrerank-bge-default preset:"
grep -A 9 "rerank-bge-default" common/arg.cpp

echo -e "\nfim-server-qwen-1.5b preset:"
grep -A 11 "fim-server-qwen-1.5b" common/arg.cpp

echo -e "\nembedding-server-bge preset:"
grep -A 12 "embedding-server-bge" common/arg.cpp

echo -e "\nspec-server-qwen-7b preset:"
grep -A 15 "spec-server-qwen-7b" common/arg.cpp

# Run the tests for arg-parser
echo -e "\nRunning the arg-parser tests to verify presets do not break existing functionality:"
cd tests && ../build/bin/test-arg-parser

echo -e "\nVerification complete. The presets are correctly defined in the code."

0 commit comments

Comments (0)