Commit 698baca

Merge branch 'ggerganov:master' into master

2 parents: 9c652f4 + 369be55


46 files changed (+2242, -926 lines)

README.md (1 addition, 0 deletions)

```diff
@@ -235,6 +235,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
+| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |

 ## Building the project
```

common/arg.cpp (1 addition, 1 deletion)

```diff
@@ -674,7 +674,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--no-context-shift"},
-        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
         [](common_params & params) {
             params.ctx_shift = false;
         }
```
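The reworded option takes no value; passing it simply pins `ctx_shift` to `false`. A typical invocation (model path illustrative):

```sh
# run unbounded generation without context shifting
./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -n -1 --no-context-shift
```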

common/chat-template.hpp (21 additions, 7 deletions)

```diff
@@ -249,16 +249,30 @@ class chat_template {
             inputs.add_generation_prompt = false;
             full = apply(inputs);
         }
-
-        if (full.find(prefix) != 0) {
-            if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
-                prefix = prefix.substr(0, prefix.size() - eos_token_.size());
+        auto eos_pos_last = full.rfind(eos_token_);
+        if (eos_pos_last == prefix.size() - eos_token_.size() ||
+            (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+            full = full.substr(0, eos_pos_last);
+        }
+        size_t common_prefix_length = 0;
+        for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+            if (prefix[i] != full[i]) {
+                break;
             }
+            if (prefix[i] == '<') {
+                // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                // but it removes thinking tags for past messages.
+                // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+                continue;
+            }
+            common_prefix_length = i + 1;
         }
-        if (full.find(prefix) != 0) {
+        auto example = full.substr(common_prefix_length);
+        if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
             fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+        } else {
+            tool_call_example_ = example;
         }
-        tool_call_example_ = full.substr(prefix.size());
     }
 } catch (const std::exception & e) {
     fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@@ -363,7 +377,7 @@ class chat_template {
     if (polyfill_tools) {
         adjusted_messages = add_system(inputs.messages,
             "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-            (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
+            (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
     } else {
         adjusted_messages = inputs.messages;
     }
```
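In place of the old exact-prefix check, the template is now rendered with and without a tool call, a trailing EOS token is trimmed from the full rendering, and the tool-call example is whatever follows the longest common prefix of the two renderings. The loop deliberately refuses to count a matching `<` until the characters after it also match, so a divergence like `<think>` vs. `<|tool▁calls▁begin|>` does not leave a dangling `<` inside the prefix. A standalone sketch of that prefix rule (illustrative, not the class's API):

```cpp
#include <cstdio>
#include <string>

// Longest common prefix of two renderings, except that a matching '<' only
// counts once the characters after it also match; this keeps a lone '<' out
// of the prefix when the strings diverge right after it.
static size_t common_prefix_length(const std::string & prefix, const std::string & full) {
    size_t n = 0;
    for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
        if (prefix[i] != full[i]) break;
        if (prefix[i] == '<') continue; // don't consume a leading '<'
        n = i + 1;
    }
    return n;
}

int main() {
    printf("%zu\n", common_prefix_length("A<think>", "A<|tool|>")); // prints 1, not 2
}
```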

common/log.h (1 addition, 0 deletions)

```diff
@@ -2,6 +2,7 @@

 #include "ggml.h" // for ggml_log_level

+#define LOG_CLR_TO_EOL "\033[K\r"
 #define LOG_COL_DEFAULT "\033[0m"
 #define LOG_COL_BOLD "\033[1m"
 #define LOG_COL_RED "\033[31m"
```
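For context, `\033[K` is the ANSI "erase from cursor to end of line" sequence and `\r` returns the cursor to column 0, so prefixing a message with `LOG_CLR_TO_EOL` wipes any stale text, such as a progress line that ended in a bare `\r`, before the message prints. A minimal illustration (not llama.cpp's actual logger):

```cpp
#include <cstdio>

#define LOG_CLR_TO_EOL "\033[K\r"

int main() {
    // A progress update ending in '\r' leaves the cursor at column 0
    // with stale text still visible to its right...
    printf("downloading... 42%%\r");
    fflush(stdout);
    // ...which the next message erases before printing itself.
    printf(LOG_CLR_TO_EOL "done\n");
    return 0;
}
```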

common/minja.hpp (24 additions, 9 deletions)

```diff
@@ -1385,6 +1385,13 @@ static std::string strip(const std::string & s) {
     return s.substr(start, end - start + 1);
 }

+static std::string capitalize(const std::string & s) {
+    if (s.empty()) return s;
+    auto result = s;
+    result[0] = std::toupper(result[0]);
+    return result;
+}
+
 static std::string html_escape(const std::string & s) {
     std::string result;
     result.reserve(s.size());
@@ -1462,6 +1469,9 @@ class MethodCallExpr : public Expression {
     if (method->get_name() == "strip") {
         vargs.expectArgs("strip method", {0, 0}, {0, 0});
         return Value(strip(str));
+    } else if (method->get_name() == "capitalize") {
+        vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
+        return Value(capitalize(str));
     } else if (method->get_name() == "endswith") {
         vargs.expectArgs("endswith method", {1, 1}, {0, 0});
         auto suffix = vargs.args[0].get<std::string>();
```
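This exposes `capitalize` to templates, with one caveat worth noting: only the first character is touched, whereas Python's `str.capitalize()` (which Jinja2 strings inherit) also lowercases the rest of the string. A standalone check of the new helper's behavior:

```cpp
#include <cctype>
#include <cstdio>
#include <string>

// Same logic as minja's new capitalize(): uppercase the first character,
// leave the rest of the string untouched.
static std::string capitalize(const std::string & s) {
    if (s.empty()) return s;
    auto result = s;
    result[0] = std::toupper(result[0]);
    return result;
}

int main() {
    // minja:  "hello WORLD" -> "Hello WORLD"
    // Python: "hello WORLD".capitalize() == "Hello world"
    printf("%s\n", capitalize("hello WORLD").c_str());
}
```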
```diff
@@ -1792,7 +1802,7 @@ class Parser {
     auto left = parseStringConcat();
     if (!left) throw std::runtime_error("Expected left side of 'logical compare' expression");

-    static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not[\r\n\s]+in\b)");
+    static std::regex compare_tok(R"(==|!=|<=?|>=?|in\b|is\b|not\s+in\b)");
     static std::regex not_tok(R"(not\b)");
     std::string op_str;
     while (!(op_str = consumeToken(compare_tok)).empty()) {
```
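This and the regex changes below are simplifications rather than behavior changes: in `std::regex`'s default ECMAScript grammar, `\s` already matches `\r` and `\n` (alongside space, tab, vertical tab and form feed), so `[\r\n\s]` collapses to `\s` and `[\s\S\r\n]` to `[\s\S]`. A quick self-check:

```cpp
#include <cassert>
#include <regex>

int main() {
    // \s in ECMAScript-grammar std::regex covers '\r' and '\n', so
    // not[\r\n\s]+in and not\s+in accept exactly the same separators.
    std::regex ws(R"(\s+)");
    assert(std::regex_match("\r\n \t", ws));
    return 0;
}
```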
```diff
@@ -2171,7 +2181,7 @@ class Parser {
 using TemplateTokenIterator = TemplateTokenVector::const_iterator;

 std::vector<std::string> parseVarNames() {
-    static std::regex varnames_regex(R"(((?:\w+)(?:[\r\n\s]*,[\r\n\s]*(?:\w+))*)[\r\n\s]*)");
+    static std::regex varnames_regex(R"(((?:\w+)(?:\s*,\s*(?:\w+))*)\s*)");

     std::vector<std::string> group;
     if ((group = consumeTokenGroups(varnames_regex)).empty()) throw std::runtime_error("Expected variable names");
@@ -2194,13 +2204,13 @@ class Parser {
 }

 TemplateTokenVector tokenize() {
-    static std::regex comment_tok(R"(\{#([-~]?)([\s\S\r\n]*?)([-~]?)#\})");
+    static std::regex comment_tok(R"(\{#([-~]?)([\s\S]*?)([-~]?)#\})");
     static std::regex expr_open_regex(R"(\{\{([-~])?)");
-    static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
+    static std::regex block_open_regex(R"(^\{%([-~])?\s*)");
     static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter|break|continue)\b)");
     static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
-    static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
-    static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
+    static std::regex expr_close_regex(R"(\s*([-~])?\}\})");
+    static std::regex block_close_regex(R"(\s*([-~])?%\})");

     TemplateTokenVector tokens;
     std::vector<std::string> group;
@@ -2284,7 +2294,7 @@ class Parser {
     auto post_space = parseBlockClose();
     tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
 } else if (keyword == "set") {
-    static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");
+    static std::regex namespaced_var_regex(R"((\w+)\s*\.\s*(\w+))");

     std::string ns;
     std::vector<std::string> var_names;
@@ -2336,6 +2346,11 @@ class Parser {
         throw std::runtime_error("Unexpected block: " + keyword);
     }
 } else if (std::regex_search(it, end, match, non_text_open_regex)) {
+    if (!match.position()) {
+        if (match[0] != "{#")
+            throw std::runtime_error("Internal error: Expected a comment");
+        throw std::runtime_error("Missing end of comment tag");
+    }
     auto text_end = it + match.position();
     text = std::string(it, text_end);
     it = text_end;
```
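The new guard fires when an opening `{#` sits at the current position but `comment_tok` failed to match it, which can only happen if the comment is never closed. A template like this one (illustrative) now fails fast with `Missing end of comment tag` instead of being silently mis-tokenized:

```
{# this comment is never closed
```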
```diff
@@ -2400,7 +2415,7 @@ class Parser {

     auto text = text_token->text;
     if (post_space == SpaceHandling::Strip) {
-        static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
+        static std::regex trailing_space_regex(R"(\s+$)");
         text = std::regex_replace(text, trailing_space_regex, "");
     } else if (options.lstrip_blocks && it != end) {
         auto i = text.size();
@@ -2410,7 +2425,7 @@ class Parser {
     }
 }
 if (pre_space == SpaceHandling::Strip) {
-    static std::regex leading_space_regex(R"(^(\s|\r|\n)+)");
+    static std::regex leading_space_regex(R"(^\s+)");
     text = std::regex_replace(text, leading_space_regex, "");
 } else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
     if (text.length() > 0 && text[0] == '\n') {
```

common/speculative.h (1 addition, 1 deletion)

```diff
@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16; // max drafted tokens
     int n_reuse = 256;

-    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+    float p_min = 0.9f; // min probability required to accept a token in the draft
 };

 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
```
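For context, `p_min` is the acceptance threshold named in the fixed comment: a token only stays in the draft if its probability reaches the threshold (0.9 by default). A hedged sketch of such a rule, with illustrative names rather than the actual common/speculative.cpp code:

```cpp
#include <vector>

struct drafted_token { int id; float prob; };

// Illustrative only: keep drafting while each candidate token's probability
// clears p_min; stop at the first one that falls below the threshold.
static std::vector<int> build_draft(const std::vector<drafted_token> & candidates,
                                    int n_draft, float p_min) {
    std::vector<int> draft;
    for (const auto & c : candidates) {
        if ((int) draft.size() >= n_draft || c.prob < p_min) break;
        draft.push_back(c.id);
    }
    return draft;
}
```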

docs/backend/OPENCL.md (new file, 205 additions)

# llama.cpp for OpenCL

- [Background](#background)
- [OS](#os)
- [Hardware](#hardware)
- [Supported Data Types](#supported-data-types)
- [Model Preparation](#model-preparation)
- [CMake Options](#cmake-options)
- [Android](#android)
- [Windows 11 Arm64](#windows-11-arm64)
- [Known Issues](#known-issues)
- [TODO](#todo)

## Background

OpenCL (Open Computing Language) is an open, royalty-free standard for cross-platform, parallel programming of diverse accelerators found in supercomputers, cloud servers, personal computers, mobile devices and embedded platforms. OpenCL specifies a programming language (based on C99) for programming these devices and application programming interfaces (APIs) to control the platform and execute programs on the compute devices. Similar to CUDA, OpenCL has been widely used to program GPUs and is supported by most GPU vendors.

### Llama.cpp + OpenCL

The llama.cpp OpenCL backend is designed primarily to enable llama.cpp on **Qualcomm Adreno GPUs**. Thanks to the portability of OpenCL, it can also run on certain Intel GPUs, although the performance is not optimal.

## OS

| OS      | Status  | Verified                                 |
|---------|---------|------------------------------------------|
| Android | Support | Snapdragon 8 Gen 3, Snapdragon 8 Elite   |
| Windows | Support | Windows 11 Arm64 with Snapdragon X Elite |
| Linux   | Support | Ubuntu 22.04 WSL2 with Intel 12700H      |

## Hardware

### Adreno GPU

**Verified devices**

| Adreno GPU                      | Status  |
|:-------------------------------:|:-------:|
| Adreno 750 (Snapdragon 8 Gen 3) | Support |
| Adreno 830 (Snapdragon 8 Elite) | Support |
| Adreno X85 (Snapdragon X Elite) | Support |

## Supported Data Types

| DataType | Status                     |
|:--------:|:--------------------------:|
| Q4_0     | Support                    |
| Q6_K     | Support, but not optimized |

## Model Preparation

You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation.

Currently we support `Q4_0` quantization and have optimized for it. To achieve the best performance on Adreno GPUs, add `--pure` to `llama-quantize`. For example,

```sh
./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
```

Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse than with pure `Q4_0` quantization.

## CMake Options

The OpenCL backend has the following CMake options that control its behavior.

| CMake option                     | Default value | Description                                |
|:---------------------------------|:-------------:|:-------------------------------------------|
| `GGML_OPENCL_EMBED_KERNELS`      | `ON`          | Embed OpenCL kernels into the executable.   |
| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON`          | Use kernels optimized for Adreno.           |
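Both options default to `ON`, so they only need to be set explicitly to opt out. For example, a configuration that loads kernels from source files at runtime instead of embedding them might look like this (the remaining flags as in the build sections below):

```sh
cmake .. -G Ninja \
  -DGGML_OPENCL=ON \
  -DGGML_OPENCL_EMBED_KERNELS=OFF \
  -DGGML_OPENCL_USE_ADRENO_KERNELS=ON
```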
## Android

Ubuntu 22.04 is used for targeting Android. Make sure the following tools are accessible from the command line:

* Git
* CMake 3.29
* Ninja
* Python3

### I. Setup Environment

1. **Install NDK**

```sh
cd ~
wget https://dl.google.com/android/repository/commandlinetools-linux-8512546_latest.zip && \
unzip commandlinetools-linux-8512546_latest.zip && \
mkdir -p ~/android-sdk/cmdline-tools && \
mv cmdline-tools latest && \
mv latest ~/android-sdk/cmdline-tools/ && \
rm -rf commandlinetools-linux-8512546_latest.zip

yes | ~/android-sdk/cmdline-tools/latest/bin/sdkmanager "ndk;26.3.11579264"
```

2. **Install OpenCL Headers and Library**

```sh
mkdir -p ~/dev/llm
cd ~/dev/llm

git clone https://github.com/KhronosGroup/OpenCL-Headers && \
cd OpenCL-Headers && \
cp -r CL ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include

cd ~/dev/llm

git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
cd OpenCL-ICD-Loader && \
mkdir build_ndk26 && cd build_ndk26 && \
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
  -DOPENCL_ICD_LOADER_HEADERS_DIR=$HOME/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
  -DANDROID_ABI=arm64-v8a \
  -DANDROID_PLATFORM=24 \
  -DANDROID_STL=c++_shared && \
ninja && \
cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
```

### II. Build llama.cpp

```sh
cd ~/dev/llm

git clone https://github.com/ggerganov/llama.cpp && \
cd llama.cpp && \
mkdir build-android && cd build-android

cmake .. -G Ninja \
  -DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
  -DANDROID_ABI=arm64-v8a \
  -DANDROID_PLATFORM=android-28 \
  -DBUILD_SHARED_LIBS=OFF \
  -DGGML_OPENCL=ON

ninja
```
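The resulting binaries can be pushed to a device over adb and run from a shell; a typical smoke test looks like this (paths and model name are illustrative, `-ngl 99` offloads all layers to the GPU):

```sh
adb push build-android/bin/llama-cli /data/local/tmp/
adb push ggml-model-qwen-3b-Q4_0.gguf /data/local/tmp/
adb shell "cd /data/local/tmp && ./llama-cli -m ggml-model-qwen-3b-Q4_0.gguf -p 'Hello' -ngl 99"
```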
## Windows 11 Arm64

A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the following tools are accessible from the command line:

* Git
* CMake 3.29
* Clang 19
* Ninja
* Visual Studio 2022

PowerShell is used for the following instructions.

### I. Setup Environment

1. **Install OpenCL Headers and Library**

```powershell
mkdir -p ~/dev/llm

cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
mkdir build && cd build
cmake .. -G Ninja `
  -DBUILD_TESTING=OFF `
  -DOPENCL_HEADERS_BUILD_TESTING=OFF `
  -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
cmake --build . --target install

cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
mkdir build && cd build
cmake .. -G Ninja `
  -DCMAKE_BUILD_TYPE=Release `
  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
  -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
cmake --build . --target install
```

### II. Build llama.cpp

```powershell
mkdir -p ~/dev/llm
cd ~/dev/llm

git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp
mkdir build && cd build

cmake .. -G Ninja `
  -DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
  -DCMAKE_BUILD_TYPE=Release `
  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
  -DBUILD_SHARED_LIBS=OFF `
  -DGGML_OPENCL=ON
ninja
```
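As on Android, a quick smoke test run from the build directory confirms the backend is picked up (model path illustrative, `-ngl 99` offloads all layers to the GPU):

```powershell
./bin/llama-cli.exe -m ggml-model-qwen-3b-Q4_0.gguf -p "Hello" -ngl 99
```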
## Known Issues

- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.

## TODO

- Fix Qwen2.5 0.5B
- Optimization for Q6_K
- Support and optimization for Q4_K

examples/main/README.md (1 addition, 1 deletion)

````diff
@@ -37,7 +37,7 @@ Once downloaded, place your model in the models folder in llama.cpp.

 ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
 ```bash
-./llama-cli -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
+./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 ```

 ### Windows:
````
