Merged

105 commits
043fb27
vulkan: apply MUL_MAT_ID subgroup optimization to non-coopmat devices…
0cc4m Aug 24, 2025
c247d06
CANN: ROPE cache sin/cos repeat (#15501)
noemotiovon Aug 25, 2025
7da9fed
convert : support interns1-mini (#15412)
RunningLeon Aug 25, 2025
b0ba31f
metal : add FA kernels for HS=40 (#15559)
ggerganov Aug 25, 2025
0d5a470
convert : update Ernie 4.5 dense architecture name (#15555)
ownia Aug 25, 2025
6b64f74
batched-bench : fix unified KV cache handling + pp timing (#15562)
ggerganov Aug 25, 2025
5a6bc6b
model-conversion : add model card template for embeddings [no ci] (#1…
danbev Aug 25, 2025
dfd9b5f
model-conversion : set pooling type to none in logits.cpp (#15564)
danbev Aug 25, 2025
5eff6ec
CUDA: MoE helper in device code, better tile sizes (#15525)
JohannesGaessler Aug 25, 2025
111f8d0
metal: fix regression when no metal devices are present (#15531)
booxter Aug 25, 2025
886b97a
tests: Generate unique input values for count_equal (#15487)
jeffbolznv Aug 25, 2025
4d917cd
vulkan: fix min subgroup 16 condition for mmid subgroup optimization …
0cc4m Aug 25, 2025
f7207b0
opencl: fix support ops condition for `rms_norm` (#15560)
lhez Aug 25, 2025
74f52f7
CUDA: Accelerate MXFP4 table lookup using `__byte_perm` (#15451)
Qeeweew Aug 25, 2025
34bdbbd
vulkan: Remove splitting for mul_mat_id (#15568)
jeffbolznv Aug 26, 2025
4c37636
Add a warning for special devices (#15563)
pt13762104 Aug 26, 2025
0fd90db
metal : remove contiguous assertion for src0 in IM2COL (#15577)
CISC Aug 26, 2025
39842a7
gguf-py : remove erroneous FFN_GATE entry (#15583)
CISC Aug 26, 2025
c4e9239
model : support MiniCPM-V 4.5 (#15575)
tc-mb Aug 26, 2025
1d8d83d
metal : improve `MUL_MAT_ID` (#15541)
ggerganov Aug 26, 2025
85cc1ae
context : print graph stats for memory-less contexts (#15586)
ggerganov Aug 26, 2025
79a5462
mtmd : support Kimi VL model (#15458)
ngxson Aug 26, 2025
b3964c1
metal : optimize FA vec for large sequences and BS <= 8 (#15566)
ggerganov Aug 26, 2025
8f5afa9
CUDA: return -1 for nonexistent compiled arch (#15587)
JohannesGaessler Aug 26, 2025
62cef26
model-conversion : add qat-q4 quantization targets (#15588)
danbev Aug 26, 2025
0373486
graph : fix assert in memory-less build_attn (#15590)
ggerganov Aug 26, 2025
a6a58d6
llamafile: PowerPC Sgemm Optimization (#15558)
shalinib-ibm Aug 26, 2025
44b1efa
tests: add performance test for mul mat id (#15543)
netrunnereve Aug 26, 2025
8ce3ff1
mtmd : fix mtmd ios build (#15579)
fidoriel Aug 26, 2025
8b69686
SYCL: fix rms_norm_mul_add for tensor dim not a multiple of sg_size (…
qnixsynapse Aug 26, 2025
bcbddcd
tests : fix test-opt with GGML_BACKEND_DL (#15599)
slaren Aug 26, 2025
86076f9
OpenCL: add fused group_norm/norm, mul, add (#15314)
rmatif Aug 27, 2025
fcca218
common : add -m to bash completion for --model [no ci] (#15591)
danbev Aug 27, 2025
1cf123a
ggml-cpu : add basic RVV support for vector f32 ops (#15057)
xctan Aug 27, 2025
1e74897
CANN: refactor mask handling and improve performance in FA (#15561)
noemotiovon Aug 27, 2025
1bded5a
kv-cache : better estimate of n_kv for multi-sequence batches (#15610)
ggerganov Aug 27, 2025
4737327
HIP: Enable support for ggml_backend_cuda_register_host_buffer (#15615)
IMbackK Aug 27, 2025
da54f9f
presets : add qwen3-30B-a3b FIM (#15616)
ggerganov Aug 27, 2025
fbef0fa
server: higher timeout for tests (#15621)
JohannesGaessler Aug 27, 2025
5a0e3ef
cuda: Add cublasLt_static linking when GGML_STATIC is enabled (#15622)
matiaslin Aug 28, 2025
46d9caa
model-conversion : add mmproj conversion target (#15628)
danbev Aug 28, 2025
d35a1e8
cli : change log to warning to explain reason for stopping (#15604)
jrincayc Aug 28, 2025
64387f6
gguf-py: byteswapping improvements (#12851)
AlekseiNikiforovIBM Aug 28, 2025
8a4280c
kv-cache : remove LLAMA_SET_ROWS checks (#15505)
ggerganov Aug 28, 2025
55042b3
scripts: add sqlite3 check for compare-commits.sh (#15633)
am17an Aug 28, 2025
84ab83c
model : jina-embeddings-v3 support (#13693)
CISC Aug 28, 2025
c8d0d14
kv-cache : fix find_slot to not search for continuous slot (#15638)
ggerganov Aug 28, 2025
7380414
ggml : fix SSM_SCAN for n_groups > 1 (#15625)
compilade Aug 28, 2025
6c442f4
ggml-cpu: fix invalid hsum build in debug s390x (#15634)
taronaeo Aug 28, 2025
c97dc09
CUDA: add conv2d (#15635)
mnehete32 Aug 28, 2025
a8bca68
fix: Compute the full sum in llama-eval-callback, not just the sum of…
gabe-l-hart Aug 28, 2025
e8d99dd
nvidia nemotron nano v2 (nemotronh) (#15507)
gabe-l-hart Aug 29, 2025
009b709
CUDA: fuse adds, fuse add with rms norm (#15631)
am17an Aug 29, 2025
60e5eee
chat : Seed OSS thinking + tool call support (#15552)
pwilkin Aug 29, 2025
8101786
CUDA: fix bug in rms_norm fusion (#15660)
am17an Aug 29, 2025
792b44f
server : add documentation for `parallel_tool_calls` param (#15647)
ExtReMLapin Aug 29, 2025
3d16b29
scripts: strip "AMD Instinct" from GPU name (#15668)
JohannesGaessler Aug 29, 2025
d82f6aa
server : removed obsolete doc (#15670)
l29ah Aug 29, 2025
ef47691
CANN: FIx compiler warnings (#15661)
noemotiovon Aug 30, 2025
696fccf
vulkan: Skip syncing for prealloc_y when it is reused (#15544)
jeffbolznv Aug 30, 2025
38ad381
CUDA: use FP32 arithmetic for conv2d (#15683)
JohannesGaessler Aug 30, 2025
e81b8e4
llama: use FA + max. GPU layers by default (#15434)
JohannesGaessler Aug 30, 2025
dd89255
Update build.md to remove MSVC arm64 notes (#15684)
slaren Aug 30, 2025
4d74393
ggml: update kleidiai to v1.13.0 (#15663)
chaxu01 Aug 30, 2025
94e82c7
vulkan: clamp matmul and FA results to the max finite value (#15652)
jeffbolznv Aug 31, 2025
b97c9ed
vulkan: Allow fallback to sysmem memory when vidmem is full (#15649)
jeffbolznv Aug 31, 2025
5c16b9c
vulkan : remove unused portability_enumeration_ext variable (#15679)
danbev Aug 31, 2025
c37052a
vulkan: mul_mat_id coopmat2 optimizations (#15546)
jeffbolznv Aug 31, 2025
bbbf5ec
vulkan: handle large sizes for get_rows (#15686)
jeffbolznv Aug 31, 2025
7d3c9f2
ci : explicitly set fa off or on (#15692)
CISC Aug 31, 2025
9777032
llama : separate compute buffer reserve from fattn check (#15696)
slaren Aug 31, 2025
2749662
llama : fix fattn reserve call n_seqs parameter (#15699)
slaren Aug 31, 2025
4efd5a8
metal : fix checks for available FA kernels (#15700)
ggerganov Aug 31, 2025
0d161f0
server : enable /slots by default and make it secure (#15630)
ggerganov Aug 31, 2025
e92d53b
sampling : optimize samplers by reusing bucket sort (#15665)
ggerganov Aug 31, 2025
3dc7397
CANN: fix RoPE cache issue on multi-device (#15629)
hipudding Sep 1, 2025
b9382c3
CANN: Optimize MUL_MAT_ID (#15658)
hipudding Sep 1, 2025
b66df9d
CUDA: fix build error from ambiguous __half conversions in conv2d (#1…
qnixsynapse Sep 1, 2025
4795c91
docs : add Hunyuan to models section (#15707)
DamonFool Sep 1, 2025
77dee9d
ggml : WebGPU add TRANSPOSE and RESHAPE to supported ops (#15695)
danbev Sep 1, 2025
02c1813
Vulkan: Add Integer Dot Product mul_mat_vec shader for legacy quants …
0cc4m Sep 1, 2025
4b20d8b
convert : remove redundant code (#15708)
DamonFool Sep 1, 2025
a0c2b20
ggml: aarch64: Implement SVE F16 kernels for vector functions (#15115)
Vithulep Sep 1, 2025
078ce23
ggml: SVE support for exponential functions (#15145)
s-goto-11 Sep 1, 2025
fec7911
vulkan: disable large mmv subgroups on older Nvidia GPUs (#15717)
0cc4m Sep 1, 2025
35a42ed
vulkan: add missing clamps in new mul_mat_id paths (#15702)
jeffbolznv Sep 1, 2025
d4d8dbe
vulkan: use memory budget extension to read memory usage (#15545)
giladgd Sep 1, 2025
5d804a4
ggml-backend: raise GGML_MAX_SPLIT_INPUTS (#15722)
JohannesGaessler Sep 1, 2025
ef2af57
CANN: Support ext_factor in rope (#15710)
hipudding Sep 2, 2025
2f85368
CANN: Support eager execution mode under ACL graph compilation (#15712)
noemotiovon Sep 2, 2025
97669e4
opencl: add attn sinks support for FA kernels (#15706)
rmatif Sep 2, 2025
25f1045
vulkan: Fix macro parameter order for f32 matmul shaders (#15716)
jeffbolznv Sep 2, 2025
9961d24
CANN: Resolve soft_max precision issue (#15730)
hipudding Sep 2, 2025
0a2a384
vulkan: fix shaders gen when no integer dot is available (#15740)
0cc4m Sep 2, 2025
c466abe
llama: -fa 1/0/-1 aliases for -fa on/off/auto (#15746)
JohannesGaessler Sep 2, 2025
69db8a5
chore: Update `.clang-format` to use `BinPackArguments=true` (#15744)
ORippler Sep 2, 2025
3de0082
fix: resolve unsigned int initialization warning for n_dims/size in g…
skrandy Sep 2, 2025
8a2234e
CANN: Fix type float_t to float (#15736)
noemotiovon Sep 3, 2025
f6da8cb
CANN: Mask unsupported TRANSPOSE_1D operator (#15733)
hipudding Sep 3, 2025
8c3fdf4
model-conversion : add missing curl script [no ci] (#15761)
danbev Sep 3, 2025
05c0380
ggml-cpu : optimize RVV kernels (#15720)
xctan Sep 3, 2025
5eae934
CANN: Add RoPE contiguous check for 310I DUP device (#15735)
hipudding Sep 3, 2025
40a751e
model-conversion : remove hardcoded /bin/bash shebangs [no ci] (#15765)
danbev Sep 3, 2025
2c8dac7
llama : fix incorrect model type for Gemma 270M (#15764)
danbev Sep 3, 2025
cdedb70
sampling : optimize dist sampler (#15704)
ggerganov Sep 3, 2025
2 changes: 1 addition & 1 deletion .clang-format
@@ -22,7 +22,7 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: false
BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
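A quick way to confirm the flipped setting locally (a sketch, assuming clang-format is on PATH and the command runs from the repository root so this .clang-format is picked up):

# With BinPackArguments: true, call arguments that overflow the column limit are packed
# onto as few continuation lines as fit, rather than one argument per line as before.
clang-format --style=file --dump-config | grep BinPackArguments
# should print: BinPackArguments: true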
1 change: 1 addition & 0 deletions README.md
@@ -137,6 +137,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)

#### Multimodal

20 changes: 10 additions & 10 deletions ci/run.sh
@@ -386,10 +386,10 @@ function gg_run_open_llama_7b_v2 {

(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

function check_ppl {
qnt="$1"
@@ -520,8 +520,8 @@ function gg_run_pythia_1_4b {

(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

function check_ppl {
qnt="$1"
@@ -651,10 +651,10 @@ function gg_run_pythia_2_8b {

(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

function check_ppl {
qnt="$1"
62 changes: 36 additions & 26 deletions common/arg.cpp
@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
printf("\"\n\n");

printf(" case \"$prev\" in\n");
printf(" --model)\n");
printf(" --model|-m)\n");
printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
printf(" return 0\n");
printf(" ;;\n");
@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg(
{"-fa", "--flash-attn"},
string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
[](common_params & params) {
params.flash_attn = true;
{"-fa", "--flash-attn"}, "FA",
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
[](common_params & params, const std::string & value) {
if (value == "on" || value == "enabled" || value == "1") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
} else if (value == "off" || value == "disabled" || value == "0") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
} else if (value == "auto" || value == "-1") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
}
}
).set_env("LLAMA_ARG_FLASH_ATTN"));
add_opt(common_arg(
@@ -2555,15 +2563,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--lora"}, "FNAME",
"path to LoRA adapter (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & value) {
params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg(
{"--lora-scaled"}, "FNAME", "SCALE",
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
[](common_params & params, const std::string & fname, const std::string & scale) {
params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
}
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -2954,20 +2962,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.endpoint_metrics = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
add_opt(common_arg(
{"--slots"},
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
[](common_params & params) {
params.endpoint_slots = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--props"},
string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
[](common_params & params) {
params.endpoint_props = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
add_opt(common_arg(
{"--slots"},
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
[](common_params & params) {
params.endpoint_slots = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--no-slots"},
"disables slots monitoring endpoint",
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
params.n_ubatch = 1024;
params.n_batch = 1024;
params.n_ctx = 0;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
params.n_ubatch = 1024;
params.n_batch = 1024;
params.n_ctx = 0;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
params.n_ubatch = 1024;
params.n_batch = 1024;
params.n_ctx = 0;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.n_gpu_layers = 99;
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
params.n_ubatch = 1024;
params.n_batch = 1024;
params.n_ctx = 0;
@@ -3527,10 +3526,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.n_gpu_layers = 99;
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
params.n_ubatch = 1024;
params.n_batch = 1024;
params.n_ctx = 0;
params.n_cache_reuse = 256;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));

add_opt(common_arg(
{"--fim-qwen-30b-default"},
string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
params.port = 8012;
params.n_ubatch = 1024;
params.n_batch = 1024;
params.n_ctx = 0;
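The preset and flash-attention changes above can be exercised directly from the command line. A minimal sketch, assuming binaries built into ./bin as in ci/run.sh and a placeholder model path; the '-fa' values and the '--fim-qwen-30b-default' preset are the ones added in this diff:

# -fa is now a tri-state option; 1/0/-1 are accepted as aliases for on/off/auto.
./bin/llama-cli -m /path/to/model.gguf -fa on    # force flash attention on
./bin/llama-cli -m /path/to/model.gguf -fa off   # force it off
./bin/llama-cli -m /path/to/model.gguf -fa auto  # let the runtime pick

# The same value can be supplied through the environment variable registered above.
LLAMA_ARG_FLASH_ATTN=off ./bin/llama-server -m /path/to/model.gguf

# New FIM preset: pulls ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF and serves it on port 8012.
./bin/llama-server --fim-qwen-30b-default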