
Commit a6cb7c8

Merge pull request #9 from ggml-org/master
Merge from upstream
2 parents dec0c8d + d00cbea


71 files changed: +3470 −1818 lines

.github/workflows/build.yml

Lines changed: 52 additions & 8 deletions

@@ -444,8 +444,8 @@ jobs:
       # This is using llvmpipe and runs slower than other backends
       ctest -L main --verbose --timeout 4200

-  ubuntu-22-cmake-webgpu:
-    runs-on: ubuntu-22.04
+  ubuntu-24-cmake-webgpu:
+    runs-on: ubuntu-24.04

     steps:
       - name: Clone
@@ -455,16 +455,34 @@ jobs:
       - name: ccache
         uses: ggml-org/[email protected]
         with:
-          key: ubuntu-22-cmake-webgpu
+          key: ubuntu-24-cmake-webgpu
           evict-old-files: 1d

-      - name: Vulkan SDK Dependencies
-        id: vulkan-depends
+      - name: Dependencies
+        id: depends
         run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo add-apt-repository -y ppa:kisak/kisak-mesa
           sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
+
+      - name: Get latest Vulkan SDK version
+        id: vulkan_sdk_version
+        run: |
+          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
+
+      - name: Use Vulkan SDK Cache
+        uses: actions/cache@v4
+        id: cache-sdk
+        with:
+          path: ./vulkan_sdk
+          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+
+      - name: Setup Vulkan SDK
+        if: steps.cache-sdk.outputs.cache-hit != 'true'
+        uses: ./.github/actions/linux-setup-vulkan
+        with:
+          path: ./vulkan_sdk
+          version: ${{ env.VULKAN_SDK_VERSION }}

       - name: Dawn Dependency
         id: dawn-depends
@@ -1497,3 +1515,29 @@ jobs:
         run: |
           vulkaninfo --summary
           GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-arm64-cpu-kleidiai:
+    runs-on: ubuntu-22.04-arm
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/[email protected]
+        with:
+          key: ggml-ci-arm64-cpu-kleidiai
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential libcurl4-openssl-dev
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

CODEOWNERS

Lines changed: 1 addition & 0 deletions

@@ -70,6 +70,7 @@
 /ggml/src/ggml-rpc/ @rgerganov
 /ggml/src/ggml-threading.* @ggerganov @slaren
 /ggml/src/ggml-vulkan/ @0cc4m
+/ggml/src/ggml-webgpu/ @reeselevine
 /ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
 /ggml/src/ggml.c @ggerganov @slaren
 /ggml/src/ggml.cpp @ggerganov @slaren

ci/run.sh

Lines changed: 31 additions & 0 deletions

@@ -22,6 +22,9 @@
 # # with MUSA support
 # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with KLEIDIAI support
+# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -115,6 +118,34 @@ if [ ! -z ${GG_BUILD_NO_SVE} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
 fi

+if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
+    echo ">>===== Enabling KleidiAI support"
+
+    CANDIDATES=("armv9-a+dotprod+i8mm" "armv8.6-a+dotprod+i8mm" "armv8.2-a+dotprod")
+    CPU=""
+
+    for cpu in "${CANDIDATES[@]}"; do
+        if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
+            CPU="$cpu"
+            break
+        fi
+    done
+
+    if [ -z "$CPU" ]; then
+        echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
+        exit 1
+    fi
+
+    echo ">>===== Using ARM baseline: ${CPU}"
+
+    CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
+        -DGGML_NATIVE=OFF \
+        -DGGML_CPU_KLEIDIAI=ON \
+        -DGGML_CPU_AARCH64=ON \
+        -DGGML_CPU_ARM_ARCH=${CPU} \
+        -DBUILD_SHARED_LIBS=OFF"
+fi

 ## helpers

 # download a file if it does not exist or if it is outdated
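
Note: the new block probes the toolchain at configure time by feeding an empty C++ program to ${CXX:-c++} with each candidate -march flag and picking the first baseline the compiler accepts. All three candidates enable the Arm dot-product extension, so a translation unit compiled with the selected baseline should see the matching feature-test macro. A minimal sketch of such a check (illustrative only, not part of this commit):

// check_baseline.cpp: illustrative sketch, not part of this commit.
// Compile with the baseline selected by ci/run.sh, for example:
//   c++ -march=armv8.2-a+dotprod -c check_baseline.cpp
#if !defined(__ARM_FEATURE_DOTPROD)
#error "selected -march baseline does not enable the Arm dot-product extension"
#endif
int main() { return 0; }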

common/arg.cpp

Lines changed: 10 additions & 1 deletion

@@ -1935,6 +1935,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx_checkpoints = value;
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--cache-ram", "-cram"}, "N",
+        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
+        [](common_params & params, int value) {
+            params.cache_ram_mib = value;
+        }
+    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -3432,7 +3440,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
-        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "- deepseek: puts thoughts in `message.reasoning_content`\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
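
Note: --cache-ram / LLAMA_ARG_CACHE_RAM stores a plain MiB count in params.cache_ram_mib; per the help text, -1 means no limit and 0 disables the cache. A sketch of how a consumer might turn that value into a byte budget (the helper below is an assumption for illustration, not code from this commit):

#include <cstdint>
#include <limits>

// Hypothetical helper: converts the MiB-denominated --cache-ram value into a
// byte budget. Any negative value (e.g. -1) = unlimited, 0 = cache disabled,
// N = N MiB.
static int64_t cache_ram_budget_bytes(int32_t cache_ram_mib) {
    if (cache_ram_mib < 0) {
        return std::numeric_limits<int64_t>::max();
    }
    return int64_t(cache_ram_mib) * 1024 * 1024;
}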

common/chat-parser.cpp

Lines changed: 125 additions & 13 deletions

@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"

+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>

 using json = nlohmann::ordered_json;
@@ -166,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }

 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -178,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
             return true;
         }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }

+        move_to(cursor);
+        return true;
     }
-    return false;
 }

 std::string common_chat_msg_parser::consume_rest() {
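
Note: when the end tag has not fully streamed in, the new loop calls string_find_partial_stop to locate a suffix of the buffer that could be the start of </think>, and holds that tail back rather than emitting it as reasoning. A self-contained sketch of the idea (the helper below is a simplified stand-in, not the function used by the commit):

#include <algorithm>
#include <cassert>
#include <string_view>

// Simplified stand-in for string_find_partial_stop: returns the position of
// the longest suffix of `text` that is a proper prefix of `stop`, or npos if
// no suffix of `text` could start the stop string.
static size_t find_partial_stop(std::string_view text, std::string_view stop) {
    for (size_t len = std::min(text.size(), stop.size() - 1); len > 0; len--) {
        if (text.substr(text.size() - len) == stop.substr(0, len)) {
            return text.size() - len;
        }
    }
    return std::string_view::npos;
}

int main() {
    // "</thi" could still become "</think>", so everything from index 10 on
    // is held back instead of being emitted as reasoning content.
    assert(find_partial_stop("some plan </thi", "</think>") == 10);
    // No suffix here can start "</think>", so the whole buffer is safe to emit.
    assert(find_partial_stop("some plan.", "</think>") == std::string_view::npos);
    return 0;
}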

common/chat.cpp

Lines changed: 3 additions & 0 deletions

@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
     return data;
 }
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
 }

 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
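
Note: with these two call sites, Llama 3.x and plain content-only responses that open with a <think> block now get their reasoning split out of message.content. A toy extractor showing the intended split for the simple single-block case (illustrative only; the real parser also handles streaming, forced-open and unclosed tags, and whitespace stripping):

#include <cassert>
#include <string>

struct parsed_msg {
    std::string reasoning_content;
    std::string content;
};

// Toy single-block extractor: text between a leading <think> and </think>
// lands in reasoning_content, everything after the close tag stays in content.
static parsed_msg parse_think(const std::string & input) {
    const std::string open = "<think>", close = "</think>";
    parsed_msg msg;
    size_t end = input.find(close);
    if (input.find(open) != 0 || end == std::string::npos) {
        msg.content = input; // no leading reasoning block: everything is content
        return msg;
    }
    msg.reasoning_content = input.substr(open.size(), end - open.size());
    msg.content = input.substr(end + close.size());
    return msg;
}

int main() {
    auto msg = parse_think("<think>check the units first</think>The answer is 42.");
    assert(msg.reasoning_content == "check the units first");
    assert(msg.content == "The answer is 42.");
    return 0;
}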

common/chat.h

Lines changed: 3 additions & 3 deletions

@@ -33,8 +33,8 @@ struct common_chat_msg_content_part {
 struct common_chat_msg {
     std::string role;
     std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
+    std::vector<common_chat_msg_content_part> content_parts;
+    std::vector<common_chat_tool_call> tool_calls;
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
@@ -44,7 +44,7 @@ struct common_chat_msg {
     bool empty() const {
         return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
     }
-    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
         for (auto i = 0u; i < tool_calls.size(); i++) {
             if (ids_cache.size() <= i) {
                 auto id = tool_calls[i].id;
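
Note: the method renamed from ensure_tool_call_ids_set to set_tool_call_ids keeps tool-call IDs stable while a streamed message is re-parsed: the per-message ids_cache records each call's ID the first time it appears (generating one if the model produced none) and reapplies it on every later parse. A self-contained toy model of that loop (simplified types, not the actual struct):

#include <functional>
#include <string>
#include <vector>

struct tool_call { std::string id; };

// Mirrors the loop shown above: missing IDs are generated once, cached, and
// reused, so repeated parses of a growing stream never reshuffle IDs.
static void set_tool_call_ids(std::vector<tool_call> & tool_calls,
                              std::vector<std::string> & ids_cache,
                              const std::function<std::string()> & gen_id) {
    for (size_t i = 0; i < tool_calls.size(); i++) {
        if (ids_cache.size() <= i) {
            // first time call i is seen: keep its own id if present, else generate
            ids_cache.push_back(!tool_calls[i].id.empty() ? tool_calls[i].id : gen_id());
        }
        tool_calls[i].id = ids_cache[i];
    }
}

int main() {
    std::vector<tool_call> calls = {{""}, {"call_7"}};
    std::vector<std::string> cache;
    int n = 0;
    auto gen = [&]() { return "gen_" + std::to_string(n++); };
    set_tool_call_ids(calls, cache, gen); // calls[0].id == "gen_0", calls[1].id == "call_7"
    set_tool_call_ids(calls, cache, gen); // idempotent: cache keeps the IDs stable
    return 0;
}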

common/common.h

Lines changed: 4 additions & 3 deletions

@@ -378,7 +378,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false; // context shift on infinite text generation
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache

@@ -425,15 +425,16 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
-    int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
+    int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
+    int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.

     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
