Commit 2879f38

Merge branch 'MCP-SSE-Server-support' of https://github.com/brucepro/llama.cpp into MCP-SSE-Server-support
2 parents: d3ef627 + 04045bb


82 files changed: +4670 −1587 lines

.devops/musa.Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.0
+ARG MUSA_VERSION=rc3.1.1
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -443,7 +443,7 @@ jobs:
 
   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
 
     steps:
       - name: Clone

README.md

Lines changed: 16 additions & 2 deletions
@@ -235,6 +235,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
+| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 
 ## Building the project
 
@@ -518,5 +519,18 @@ If your issue is with model generation quality, then please at least scan the fo
 - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
 - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
 
-#### References
-
+## Completions
+Command-line completion is available for some environments.
+
+#### Bash Completion
+```bash
+$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
+$ source ~/.llama-completion.bash
+```
+Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
+automatically. For example:
+```console
+$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
+```
+
+## References

common/arg.cpp

Lines changed: 174 additions & 1 deletion
@@ -365,6 +365,108 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+    for (auto & opt : ctx_arg.options) {
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    printf("_llama_completions() {\n");
+    printf("    local cur prev opts\n");
+    printf("    COMPREPLY=()\n");
+    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+    printf("    opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+        for (const common_arg * opt : options) {
+            for (const char * arg : opt->args) {
+                printf("%s ", arg);
+            }
+        }
+    };
+
+    print_options(common_options);
+    print_options(sparam_options);
+    print_options(specific_options);
+    printf("\"\n\n");
+
+    printf("    case \"$prev\" in\n");
+    printf("        --model)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --grammar-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        *)\n");
+    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("    esac\n");
+    printf("}\n\n");
+
+    std::set<std::string> executables = {
+        "llama-batched",
+        "llama-batched-bench",
+        "llama-bench",
+        "llama-cli",
+        "llama-convert-llama2c-to-ggml",
+        "llama-cvector-generator",
+        "llama-embedding",
+        "llama-eval-callback",
+        "llama-export-lora",
+        "llama-gbnf-validator",
+        "llama-gen-docs",
+        "llama-gguf",
+        "llama-gguf-hash",
+        "llama-gguf-split",
+        "llama-gritlm",
+        "llama-imatrix",
+        "llama-infill",
+        "llama-llava-cli",
+        "llama-llava-clip-quantize-cli",
+        "llama-lookahead",
+        "llama-lookup",
+        "llama-lookup-create",
+        "llama-lookup-merge",
+        "llama-lookup-stats",
+        "llama-minicpmv-cli",
+        "llama-parallel",
+        "llama-passkey",
+        "llama-perplexity",
+        "llama-q8dot",
+        "llama-quantize",
+        "llama-quantize-stats",
+        "llama-qwen2vl-cli",
+        "llama-retrieval",
+        "llama-run",
+        "llama-save-load-state",
+        "llama-server",
+        "llama-simple",
+        "llama-simple-chat",
+        "llama-speculative",
+        "llama-speculative-simple",
+        "llama-tokenize",
+        "llama-tts",
+        "llama-vdot"
+    };
+
+    for (const auto& exe : executables) {
+        printf("complete -F _llama_completions %s\n", exe.c_str());
+    }
+}
+
 static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
     std::vector<ggml_backend_dev_t> devices;
     auto dev_names = string_split<std::string>(value, ',');
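For readability, this is roughly the script that the function above emits when `--completion-bash` is used (a sketch reconstructed from the printf calls; the real `opts` string and the trailing `complete` lines are generated at runtime from the registered arguments and the `executables` set):

```bash
_llama_completions() {
    local cur prev opts
    COMPREPLY=()
    cur="${COMP_WORDS[COMP_CWORD]}"
    prev="${COMP_WORDS[COMP_CWORD-1]}"

    # filled in at runtime with every registered flag, for example:
    opts="--model --grammar-file --completion-bash ..."

    case "$prev" in
        --model)
            # after --model, offer *.gguf files and directories
            COMPREPLY=( $(compgen -f -X '!*.gguf' -- "$cur") $(compgen -d -- "$cur") )
            return 0
            ;;
        --grammar-file)
            # after --grammar-file, offer *.gbnf files and directories
            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- "$cur") $(compgen -d -- "$cur") )
            return 0
            ;;
        *)
            # otherwise complete against the full flag list
            COMPREPLY=( $(compgen -W "${opts}" -- "$cur") )
            return 0
            ;;
    esac
}

complete -F _llama_completions llama-cli
complete -F _llama_completions llama-server
# ... one "complete -F" line per name in the executables set above
```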
@@ -426,6 +528,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
             }
             exit(0);
         }
+        if (ctx_arg.params.completion) {
+            common_params_print_completion(ctx_arg);
+            exit(0);
+        }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
@@ -494,6 +600,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--completion-bash"},
+        "print source-able bash completion script for llama.cpp",
+        [](common_params & params) {
+            params.completion = true;
+        }
+    ));
     add_opt(common_arg(
         {"--verbose-prompt"},
         string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -674,7 +787,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--no-context-shift"},
-        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
         [](common_params & params) {
             params.ctx_shift = false;
         }
@@ -946,6 +1059,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--top-nsigma"}, "N",
+        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_n_sigma = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
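As a quick usage sketch (the model path and prompt below are placeholders, not part of this commit), the new sampler flag is passed to llama-cli like any other sampling parameter:

```bash
# enable top-n-sigma sampling; the default of -1.0 leaves it disabled
./build/bin/llama-cli -m model.gguf -p "Once upon a time" --top-nsigma 1.0
```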
@@ -1975,6 +2095,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
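A possible invocation of the new flag with llama-server (the model file name is a placeholder; per the help text above, the setting only applies to non-streamed responses):

```bash
# put extracted thoughts into message.reasoning_content (DeepSeek R1 / Command R7B)
./build/bin/llama-server -m my-reasoning-model.gguf --jinja --reasoning-format deepseek

# or leave thought tags unparsed in message.content
./build/bin/llama-server -m my-reasoning-model.gguf --jinja --reasoning-format none
```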
@@ -2324,5 +2455,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
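A usage sketch for the new embedding presets (the prompt is illustrative; as the help text notes, these presets may download model weights from the internet):

```bash
# start a server with the default bge-small-en-v1.5 embedding preset
./build/bin/llama-server --embd-bge-small-en-default

# or compute a one-off embedding from the CLI with the gte-small preset
./build/bin/llama-embedding --embd-gte-small-default -p "hello world"
```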

common/chat-template.hpp

Lines changed: 21 additions & 7 deletions
@@ -249,16 +249,30 @@ class chat_template {
             inputs.add_generation_prompt = false;
             full = apply(inputs);
         }
-
-        if (full.find(prefix) != 0) {
-            if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
-                prefix = prefix.substr(0, prefix.size() - eos_token_.size());
+        auto eos_pos_last = full.rfind(eos_token_);
+        if (eos_pos_last == prefix.size() - eos_token_.size() ||
+            (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+            full = full.substr(0, eos_pos_last);
+        }
+        size_t common_prefix_length = 0;
+        for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+            if (prefix[i] != full[i]) {
+                break;
             }
+            if (prefix[i] == '<') {
+                // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                // but it removes thinking tags for past messages.
+                // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+                continue;
+            }
+            common_prefix_length = i + 1;
         }
-        if (full.find(prefix) != 0) {
+        auto example = full.substr(common_prefix_length);
+        if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
             fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+        } else {
+            tool_call_example_ = example;
         }
-        tool_call_example_ = full.substr(prefix.size());
     } catch (const std::exception & e) {
         fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@@ -363,7 +377,7 @@ class chat_template {
         if (polyfill_tools) {
             adjusted_messages = add_system(inputs.messages,
                 "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
+                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
         } else {
             adjusted_messages = inputs.messages;
         }
