@@ -2571,5 +2571,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
25712571        }
25722572    ).set_examples ({LLAMA_EXAMPLE_SERVER}));
25732573
2574+     add_opt (common_arg (
2575+         {" --fim-qwen-7b-spec"  },
2576+         string_format (" use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"  ),
2577+         [](common_params & params) {
2578+             params.hf_repo  = " ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"  ;
2579+             params.hf_file  = " qwen2.5-coder-7b-q8_0.gguf"  ;
2580+             params.speculative .hf_repo  = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"  ;
2581+             params.speculative .hf_file  = " qwen2.5-coder-0.5b-q8_0.gguf"  ;
2582+             params.speculative .n_gpu_layers  = 99 ;
2583+             params.port  = 8012 ;
2584+             params.n_gpu_layers  = 99 ;
2585+             params.flash_attn  = true ;
2586+             params.n_ubatch  = 1024 ;
2587+             params.n_batch  = 1024 ;
2588+             params.n_ctx  = 0 ;
2589+             params.n_cache_reuse  = 256 ;
2590+         }
2591+     ).set_examples ({LLAMA_EXAMPLE_SERVER}));
2592+ 
2593+     add_opt (common_arg (
2594+         {" --fim-qwen-14b-spec"  },
2595+         string_format (" use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"  ),
2596+         [](common_params & params) {
2597+             params.hf_repo  = " ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"  ;
2598+             params.hf_file  = " qwen2.5-coder-14b-q8_0.gguf"  ;
2599+             params.speculative .hf_repo  = " ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"  ;
2600+             params.speculative .hf_file  = " qwen2.5-coder-0.5b-q8_0.gguf"  ;
2601+             params.speculative .n_gpu_layers  = 99 ;
2602+             params.port  = 8012 ;
2603+             params.n_gpu_layers  = 99 ;
2604+             params.flash_attn  = true ;
2605+             params.n_ubatch  = 1024 ;
2606+             params.n_batch  = 1024 ;
2607+             params.n_ctx  = 0 ;
2608+             params.n_cache_reuse  = 256 ;
2609+         }
2610+     ).set_examples ({LLAMA_EXAMPLE_SERVER}));
2611+ 
25742612    return  ctx_arg;
25752613}
0 commit comments