Skip to content

Commit 972f91c

Browse files
committed
Merge branch 'master' into gg/llama-kv-cache
ggml-ci
2 parents b15fede + d7b31a9 commit 972f91c

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

61 files changed

+8547
-3622
lines changed

.github/workflows/server.yml

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,36 @@ jobs:
8181
with:
8282
node-version: '22.11.0'
8383

84+
- name: WebUI - Install dependencies
85+
id: webui_lint
86+
run: |
87+
cd examples/server/webui
88+
npm ci
89+
90+
- name: WebUI - Check code format
91+
id: webui_format
92+
run: |
93+
git config --global --add safe.directory $(realpath .)
94+
cd examples/server/webui
95+
git status
96+
97+
npm run format
98+
git status
99+
modified_files="$(git status -s)"
100+
echo "Modified files: ${modified_files}"
101+
if [ -n "${modified_files}" ]; then
102+
echo "Files do not follow coding style. To fix: npm run format"
103+
echo "${modified_files}"
104+
exit 1
105+
fi
106+
84107
- name: Verify bundled index.html
85108
id: verify_server_index_html
86109
run: |
87110
git config --global --add safe.directory $(realpath .)
88111
cd examples/server/webui
89112
git status
90-
npm ci
113+
91114
npm run build
92115
git status
93116
modified_files="$(git status -s)"

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,4 +233,4 @@ configure_file(cmake/llama.pc.in
233233
@ONLY)
234234

235235
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
236-
DESTINATION lib/pkgconfig)
236+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)

cmake/llama.pc.in

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
prefix=@CMAKE_INSTALL_PREFIX@
2-
exec_prefix=${prefix}
3-
libdir=${exec_prefix}/lib
4-
includedir=${prefix}/include
2+
exec_prefix=@CMAKE_INSTALL_PREFIX@
3+
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
4+
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
55

66
Name: llama
77
Description: Port of Facebook's LLaMA model in C/C++
8-
Version: @PROJECT_VERSION@
9-
Libs: -L${libdir} -lggml -lggml-base -lllama
8+
Version: @LLAMA_INSTALL_VERSION@
9+
Libs: -L${libdir} -lggml -lggml-base -lllama
1010
Cflags: -I${includedir}

common/arg.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2324,5 +2324,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
23242324
}
23252325
).set_examples({LLAMA_EXAMPLE_TTS}));
23262326

2327+
add_opt(common_arg(
2328+
{"--embd-bge-small-en-default"},
2329+
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
2330+
[](common_params & params) {
2331+
params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
2332+
params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
2333+
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
2334+
params.embd_normalize = 2;
2335+
params.n_ctx = 512;
2336+
params.verbose_prompt = true;
2337+
params.embedding = true;
2338+
}
2339+
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
2340+
2341+
add_opt(common_arg(
2342+
{"--embd-e5-small-en-default"},
2343+
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
2344+
[](common_params & params) {
2345+
params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
2346+
params.hf_file = "e5-small-v2-q8_0.gguf";
2347+
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
2348+
params.embd_normalize = 2;
2349+
params.n_ctx = 512;
2350+
params.verbose_prompt = true;
2351+
params.embedding = true;
2352+
}
2353+
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
2354+
2355+
add_opt(common_arg(
2356+
{"--embd-gte-small-default"},
2357+
string_format("use default gte-small model (note: can download weights from the internet)"),
2358+
[](common_params & params) {
2359+
params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
2360+
params.hf_file = "gte-small-q8_0.gguf";
2361+
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
2362+
params.embd_normalize = 2;
2363+
params.n_ctx = 512;
2364+
params.verbose_prompt = true;
2365+
params.embedding = true;
2366+
}
2367+
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
2368+
23272369
return ctx_arg;
23282370
}

common/chat-template.hpp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -249,16 +249,30 @@ class chat_template {
249249
inputs.add_generation_prompt = false;
250250
full = apply(inputs);
251251
}
252-
253-
if (full.find(prefix) != 0) {
254-
if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
255-
prefix = prefix.substr(0, prefix.size() - eos_token_.size());
252+
auto eos_pos_last = full.rfind(eos_token_);
253+
if (eos_pos_last == prefix.size() - eos_token_.size() ||
254+
(full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
255+
full = full.substr(0, eos_pos_last);
256+
}
257+
size_t common_prefix_length = 0;
258+
for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
259+
if (prefix[i] != full[i]) {
260+
break;
256261
}
262+
if (prefix[i] == '<') {
263+
// DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
264+
// but it removes thinking tags for past messages.
265+
// The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
266+
continue;
267+
}
268+
common_prefix_length = i + 1;
257269
}
258-
if (full.find(prefix) != 0) {
270+
auto example = full.substr(common_prefix_length);
271+
if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
259272
fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
273+
} else {
274+
tool_call_example_ = example;
260275
}
261-
tool_call_example_ = full.substr(prefix.size());
262276
}
263277
} catch (const std::exception & e) {
264278
fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@@ -363,7 +377,7 @@ class chat_template {
363377
if (polyfill_tools) {
364378
adjusted_messages = add_system(inputs.messages,
365379
"You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
366-
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
380+
(!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
367381
} else {
368382
adjusted_messages = inputs.messages;
369383
}

common/llguidance.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
254254
};
255255
}
256256

257-
return new llama_sampler{
257+
return llama_sampler_init(
258258
/* .iface = */ &llama_sampler_llg_i,
259-
/* .ctx = */ ctx,
260-
};
259+
/* .ctx = */ ctx
260+
);
261261
}
262262

263263
#else

common/log.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "ggml.h" // for ggml_log_level
44

5+
#define LOG_CLR_TO_EOL "\033[K\r"
56
#define LOG_COL_DEFAULT "\033[0m"
67
#define LOG_COL_BOLD "\033[1m"
78
#define LOG_COL_RED "\033[31m"

common/minja.hpp

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,6 +1385,13 @@ static std::string strip(const std::string & s) {
13851385
return s.substr(start, end - start + 1);
13861386
}
13871387

1388+
static std::string capitalize(const std::string & s) {
1389+
if (s.empty()) return s;
1390+
auto result = s;
1391+
result[0] = std::toupper(result[0]);
1392+
return result;
1393+
}
1394+
13881395
static std::string html_escape(const std::string & s) {
13891396
std::string result;
13901397
result.reserve(s.size());
@@ -1462,6 +1469,9 @@ class MethodCallExpr : public Expression {
14621469
if (method->get_name() == "strip") {
14631470
vargs.expectArgs("strip method", {0, 0}, {0, 0});
14641471
return Value(strip(str));
1472+
} else if (method->get_name() == "capitalize") {
1473+
vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
1474+
return Value(capitalize(str));
14651475
} else if (method->get_name() == "endswith") {
14661476
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
14671477
auto suffix = vargs.args[0].get<std::string>();
@@ -1792,7 +1802,7 @@ class Parser {
17921802
auto left = parseStringConcat();
17931803
if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");
17941804

1795-
static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
1805+
static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
17961806
static std::regex not_tok(R"(not\b)");
17971807
std::string op_str;
17981808
while (!(op_str = consumeToken(compare_tok)).empty()) {
@@ -2171,7 +2181,7 @@ class Parser {
21712181
using TemplateTokenIterator = TemplateTokenVector::const_iterator;
21722182

21732183
std::vector<std::string> parseVarNames() {
2174-
static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");
2184+
static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");
21752185

21762186
std::vector<std::string> group;
21772187
if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@@ -2194,13 +2204,13 @@ class Parser {
21942204
}
21952205

21962206
TemplateTokenVector tokenize() {
2197-
static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
2207+
static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
21982208
static std::regex expr_open_regex(R"(\{\{([-~])?)");
2199-
static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
2209+
static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
22002210
static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
22012211
static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
2202-
static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
2203-
static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
2212+
static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
2213+
static std::regex block_close_regex(R"(\s*([-~])?%\})");
22042214

22052215
TemplateTokenVector tokens;
22062216
std::vector<std::string> group;
@@ -2284,7 +2294,7 @@ class Parser {
22842294
auto post_space = parseBlockClose();
22852295
tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
22862296
} else if (keyword == "set") {
2287-
static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");
2297+
static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");
22882298

22892299
std::string ns;
22902300
std::vector<std::string> var_names;
@@ -2336,6 +2346,11 @@ class Parser {
23362346
throw std::runtime_error("Unexpected block: " + keyword);
23372347
}
23382348
} else if (std::regex_search(it, end, match, non_text_open_regex)) {
2349+
if (!match.position()) {
2350+
if (match[0] != "{#")
2351+
throw std::runtime_error("Internal error: Expected a comment");
2352+
throw std::runtime_error("Missing end of comment tag");
2353+
}
23392354
auto text_end = it + match.position();
23402355
text = std::string(it, text_end);
23412356
it = text_end;
@@ -2400,7 +2415,7 @@ class Parser {
24002415

24012416
auto text = text_token->text;
24022417
if (post_space == SpaceHandling::Strip) {
2403-
static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
2418+
static std::regex trailing_space_regex(R"(\s+$)");
24042419
text = std::regex_replace(text, trailing_space_regex, "");
24052420
} else if (options.lstrip_blocks && it != end) {
24062421
auto i = text.size();
@@ -2410,7 +2425,7 @@ class Parser {
24102425
}
24112426
}
24122427
if (pre_space == SpaceHandling::Strip) {
2413-
static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
2428+
static std::regex leading_space_regex(R"(^\s+)");
24142429
text = std::regex_replace(text, leading_space_regex, "");
24152430
} else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
24162431
if (text.length() > 0 && text[0] == '\n') {

docs/build.md

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,21 +125,66 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md).
125125
126126
## CUDA
127127
128-
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
128+
This provides GPU acceleration using an NVIDIA GPU. Make sure to have the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) installed.
129129
130-
If you are using Fedora (using Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect.
130+
#### Download directly from NVIDIA
131+
You may find the official downloads here: [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads).
131132
132-
- Using `CMake`:
133133
134-
```bash
135-
cmake -B build -DGGML_CUDA=ON
136-
cmake --build build --config Release
137-
```
134+
#### Compile and run inside a Fedora Toolbox Container
135+
We also have a [guide](./cuda-fedora.md) for setting up the CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
136+
137+
**Recommended for:**
138+
139+
- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
140+
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
141+
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
142+
143+
144+
### Compilation
145+
```bash
146+
cmake -B build -DGGML_CUDA=ON
147+
cmake --build build --config Release
148+
```
149+
150+
### Override Compute Capability Specifications
151+
152+
If `nvcc` cannot detect your GPU, you may get compile warnings such as:
153+
```text
154+
nvcc warning : Cannot find valid GPU for '-arch=native', default arch is used
155+
```
138156

139-
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
157+
To override the `native` GPU detection:
158+
159+
#### 1. Take note of the `Compute Capability` of your NVIDIA devices: ["CUDA: Your GPU Compute Capability"](https://developer.nvidia.com/cuda-gpus).
160+
161+
```text
162+
GeForce RTX 4090 8.9
163+
GeForce RTX 3080 Ti 8.6
164+
GeForce RTX 3070 8.6
165+
```
166+
167+
#### 2. Manually list each varying `Compute Capability` in the `CMAKE_CUDA_ARCHITECTURES` list.
168+
169+
```bash
170+
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
171+
```
172+
173+
### Runtime CUDA environmental variables
174+
175+
You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
176+
177+
```bash
178+
# Use `CUDA_VISIBLE_DEVICES` to hide the first compute device.
179+
CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf
180+
```
181+
182+
### Unified Memory
140183

141184
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
142185

186+
### Performance Tuning
187+
143188
The following compilation options are also available to tweak performance:
144189

145190
| Option | Legal values | Default | Description |

0 commit comments

Comments
 (0)