Skip to content

Commit 0dc6b9f

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   ggml/CMakeLists.txt
#   ggml/src/CMakeLists.txt
#   ggml/src/ggml-cann/ggml-cann.cpp
#   ggml/src/ggml-cpu/amx/amx.cpp
#   ggml/src/ggml-cuda/CMakeLists.txt
#   ggml/src/ggml-opencl/CMakeLists.txt
#   ggml/src/ggml-opencl/ggml-opencl.cpp
#   ggml/src/ggml-opencl/kernels/cvt.cl
#   ggml/src/ggml-rpc/ggml-rpc.cpp
#   ggml/src/ggml-sycl/ggml-sycl.cpp
#   ggml/src/ggml-webgpu/ggml-webgpu.cpp
#   ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
#   ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl
#   ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
#   ggml/src/ggml-zdnn/ggml-zdnn.cpp
#   scripts/sync-ggml.last
#   tests/test-backend-ops.cpp
#   tests/test-chat.cpp
#   tools/llama-bench/README.md
#   tools/llama-bench/llama-bench.cpp
2 parents 326f6f3 + 7f76692 commit 0dc6b9f

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

48 files changed

+3918
-1913
lines changed

common/arg.cpp

Lines changed: 266 additions & 202 deletions
Large diffs are not rendered by default.

common/chat.cpp

Lines changed: 27 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1741,10 +1741,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
17411741
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
17421742
LOG_DBG("%s\n", __func__);
17431743
common_chat_params data;
1744-
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
1744+
const std::optional<json> tools_override = json();
1745+
const std::optional<json> additional_context = json {
17451746
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
17461747
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
1747-
});
1748+
};
1749+
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
17481750
if (inputs.tools.is_array() && !inputs.tools.empty()) {
17491751
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
17501752
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2230,15 +2232,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
22302232

22312233
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
22322234
// Parse thinking tags
2235+
static const common_regex start_think_regex(regex_escape("<think>"));
2236+
static const common_regex end_think_regex(regex_escape("</think>"));
2237+
// Granite models output partial tokens such as "<" and "<think".
2238+
// By leveraging try_consume_regex()/try_find_regex() throwing
2239+
// common_chat_msg_partial_exception for these partial tokens,
2240+
// processing is interrupted and the tokens are not passed to add_content().
2241+
if (auto res = builder.try_consume_regex(start_think_regex)) {
2242+
// Restore position for try_parse_reasoning()
2243+
builder.move_to(res->groups[0].begin);
2244+
builder.try_find_regex(end_think_regex, std::string::npos, false);
2245+
// Restore position for try_parse_reasoning()
2246+
builder.move_to(res->groups[0].begin);
2247+
}
22332248
builder.try_parse_reasoning("<think>", "</think>");
22342249

2235-
// Parse response tags using regex
2236-
static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
2237-
if (auto res = builder.try_find_regex(response_regex)) {
2238-
// Extract the content between the tags (capture group 1)
2239-
auto content = builder.str(res->groups[1]);
2240-
builder.add_content(content);
2241-
builder.move_to(res->groups[0].end);
2250+
// Parse response tags
2251+
static const common_regex start_response_regex(regex_escape("<response>"));
2252+
static const common_regex end_response_regex(regex_escape("</response>"));
2253+
// Granite models output partial tokens such as "<" and "<response".
2254+
// Same hack as reasoning parsing.
2255+
if (builder.try_consume_regex(start_response_regex)) {
2256+
builder.try_find_regex(end_response_regex);
22422257
}
22432258

22442259
if (!builder.syntax().parse_tool_calls) {
@@ -2252,13 +2267,10 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
22522267
builder.move_to(res->groups[0].end);
22532268

22542269
// Expect JSON array of tool calls
2255-
auto tool_calls_data = builder.consume_json();
2256-
if (tool_calls_data.json.is_array()) {
2257-
if (!builder.add_tool_calls(tool_calls_data.json)) {
2258-
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
2270+
if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
2271+
if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
2272+
throw common_chat_msg_partial_exception("incomplete tool call");
22592273
}
2260-
} else {
2261-
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
22622274
}
22632275
} else {
22642276
builder.add_content(builder.consume_rest());

ggml/src/ggml-backend-impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ extern "C" {
116116
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
117117

118118
// (optional) sort/optimize the nodes in the graph
119-
void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
119+
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
120120
};
121121

122122
struct ggml_backend {

ggml/src/ggml-backend.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -463,10 +463,10 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
463463
backend->iface.event_wait(backend, event);
464464
}
465465

466-
static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
466+
static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
467467
GGML_ASSERT(backend);
468-
if (backend->iface.optimize_graph != NULL) {
469-
backend->iface.optimize_graph(backend, cgraph);
468+
if (backend->iface.graph_optimize != NULL) {
469+
backend->iface.graph_optimize(backend, cgraph);
470470
}
471471
}
472472

@@ -1313,7 +1313,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
13131313

13141314
// Optimize this split of the graph. This needs to happen before we make graph_copy,
13151315
// so they are in sync.
1316-
ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
1316+
ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
13171317

13181318
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
13191319
for (int j = 0; j < split->n_inputs; j++) {

ggml/src/ggml-blas/ggml-blas.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
270270
/* .graph_compute = */ ggml_backend_blas_graph_compute,
271271
/* .event_record = */ NULL,
272272
/* .event_wait = */ NULL,
273-
/* .optimize_graph = */ NULL,
273+
/* .graph_optimize = */ NULL,
274274
};
275275

276276
static ggml_guid_t ggml_backend_blas_guid(void) {

ggml/src/ggml-cpu/common.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
2828
return GGML_BF16_TO_FP32(x);
2929
}
3030

31+
static inline float i32_to_f32(int32_t x) {
32+
return x;
33+
}
34+
35+
static inline int32_t f32_to_i32(float x) {
36+
return x;
37+
}
38+
3139
static inline float f32_to_f32(float x) {
3240
return x;
3341
}
@@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
5462
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
5563
};
5664

65+
template <>
66+
struct type_conversion_table<int32_t> {
67+
static constexpr float (*to_f32)(int32_t) = i32_to_f32;
68+
static constexpr int32_t (*from_f32)(float) = f32_to_i32;
69+
};
70+
5771
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
5872
const int64_t ith = params->ith;
5973
const int64_t nth = params->nth;

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
190190
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
191191
/* .event_record = */ NULL,
192192
/* .event_wait = */ NULL,
193-
/* .optimize_graph = */ NULL,
193+
/* .graph_optimize = */ NULL,
194194
};
195195

196196
static ggml_guid_t ggml_backend_cpu_guid(void) {

0 commit comments

Comments
 (0)