
Commit 5083622

Merge pull request #197 from menloresearch/update-dev-from-master-2025-08-07-00-13
Sync master with upstream release b6106
2 parents 3134a28 + 5fd160b commit 5083622

File tree: 23 files changed, +993 −89 lines


.github/workflows/build.yml

Lines changed: 0 additions & 2 deletions
@@ -179,7 +179,6 @@ jobs:
       - name: Test
         id: cmake_test
         run: |
-          export LLAMA_SET_ROWS=0
           cd build
           ctest -L main --verbose --timeout 900
@@ -438,7 +437,6 @@ jobs:
       - name: Test
         id: cmake_test
         run: |
-          export LLAMA_SET_ROWS=0
           cd build
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 3600
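Note: both hunks delete the same override. With export LLAMA_SET_ROWS=0 gone, the ctest jobs now exercise the runtime's default for the ggml_set_rows-based path (presumably enabled by default as of this upstream sync); the CANN context change at the bottom of this diff reads the same LLAMA_SET_ROWS variable to decide whether graph execution is allowed.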

common/chat-parser.cpp

Lines changed: 9 additions & 1 deletion
@@ -55,7 +55,15 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
     std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
     std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
-    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
+    std::string arguments = "";
+    if (tool_call.contains("arguments")) {
+        if (tool_call.at("arguments").is_object()) {
+            arguments = tool_call.at("arguments").dump();
+        } else {
+            arguments = tool_call.at("arguments");
+        }
+    }
 
     return add_tool_call(name, id, arguments);
 }
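The previous ternary only handled "arguments" arriving as a JSON string; when a model emits it as a JSON object, nlohmann::json's implicit conversion to std::string throws. A minimal standalone sketch of the new behavior (a hypothetical demo, not code from the patch):

// Hypothetical demo of the patched logic, assuming nlohmann::json.
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

static std::string arguments_to_string(const json & tool_call) {
    std::string arguments = "";
    if (tool_call.contains("arguments")) {
        if (tool_call.at("arguments").is_object()) {
            arguments = tool_call.at("arguments").dump();  // serialize object form
        } else {
            arguments = tool_call.at("arguments");         // string form passes through
        }
    }
    return arguments;
}

int main() {
    json as_string = { {"name", "get_weather"}, {"arguments", "{\"city\":\"Oslo\"}"} };
    json as_object = { {"name", "get_weather"}, {"arguments", { {"city", "Oslo"} }} };
    std::cout << arguments_to_string(as_string) << "\n";  // {"city":"Oslo"}
    std::cout << arguments_to_string(as_object) << "\n";  // {"city":"Oslo"}
}

Both forms now normalize to the same serialized-JSON string before being forwarded to the string-based overload.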

common/chat.cpp

Lines changed: 129 additions & 0 deletions
@@ -606,6 +606,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
@@ -618,6 +619,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
         case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
+        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
@@ -1734,6 +1736,124 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }
 
+static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for Granite template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
+    data.format = COMMON_CHAT_FORMAT_GRANITE;
+
+    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (!inputs.tools.is_null()) {
+        // Granite uses <|tool_call|> followed by JSON list
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
+                    "-args", {
+                        {"type", "object"},
+                        {"properties", {
+                            {"name", {{"const", name}}},
+                            {"arguments", parameters},
+                        }},
+                        {"required", json::array({"name", "arguments"})},
+                    })));
+            });
+
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+            auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
+
+            if (data.thinking_forced_open) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
+            } else {
+                builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
+            }
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+                "<|tool_call|>"
+            });
+
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+                "<|tool_call|>",
+            };
+        });
+    } else {
+        // Handle thinking tags for non-tool responses
+        if (data.thinking_forced_open && inputs.enable_thinking) {
+            data.grammar_lazy = false;
+            data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+            };
+        }
+    }
+
+    return data;
+}
+
+static void common_chat_parse_granite(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+
+    // Parse response tags using regex
+    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
+    if (auto res = builder.try_find_regex(response_regex)) {
+        // Extract the content between the tags (capture group 1)
+        auto content = builder.str(res->groups[1]);
+        builder.add_content(content);
+        builder.move_to(res->groups[0].end);
+    }
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.add_tool_calls(tool_calls_data.json)) {
+                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+            }
+        } else {
+            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        }
+    } else {
+        builder.add_content(builder.consume_rest());
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
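Taken together, the grammar and the parser define Granite's wire format: reasoning inside <think>...</think>, user-visible text inside <response>...</response>, and tool calls as a JSON array after <|tool_call|>. An assistant turn the parser above handles would look roughly like this (an illustrative sample constructed from the grammar rules; the tool name is hypothetical):

<think>The user wants current weather, so a tool call is needed.</think>
<response>Let me check that for you.</response>
<|tool_call|>[{"name": "get_weather", "arguments": {"city": "Oslo"}}]

common_chat_parse_granite would return the <think> body as reasoning, the <response> body as content, and one parsed tool call; if the JSON after <|tool_call|> is not an array, the raw text is kept as content instead.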
@@ -1805,6 +1925,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_command_r7b(tmpl, params);
     }
 
+    // Granite (IBM) - detects thinking / tools support
+    if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+        return common_chat_params_init_granite(tmpl, params);
+    }
+
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
     if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
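The detection heuristic is substring-based: a template whose Jinja source contains both an "elif thinking" branch and the <|tool_call|> token is routed to the Granite handler. Note that it runs before the Hermes 2 Pro probe below, which matches on the more generic <tool_call> marker.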
@@ -1865,6 +1990,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     int alloc_size = 0;
     std::vector<llama_chat_message> chat;
     std::vector<std::string> contents;
+
     for (const auto & msg : inputs.messages) {
         auto content = msg.content;
         for (const auto & part : msg.content_parts) {
@@ -1966,6 +2092,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GRANITE:
+            common_chat_parse_granite(builder);
+            break;
         case COMMON_CHAT_FORMAT_GPT_OSS:
             common_chat_parse_gpt_oss(builder);
             break;

common/chat.h

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -239,6 +239,7 @@ enum common_reasoning_format {
     COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    COMMON_REASONING_FORMAT_GRANITE,         // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
 struct common_params {
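Like the chat-format enum above, this adds Granite as a first-class reasoning format; per the comment it behaves like COMMON_REASONING_FORMAT_DEEPSEEK, extracting thinking-tag contents into message.reasoning_content, including in streaming deltas. It presumably becomes selectable wherever the CLI exposes this enum (e.g. a --reasoning-format granite argument), though that wiring is outside this diff.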

ggml/src/ggml-backend.cpp

Lines changed: 7 additions & 2 deletions
@@ -1071,6 +1071,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
+    // if the node is still unassigned, assign it to the first backend that supports it
+    for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
+        ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
+    }
+    GGML_ASSERT(*cur_backend_id != -1);
 }
 
 // pass 5: split graph, find tensors that need to be copied
@@ -1098,7 +1103,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
         const int node_backend_id = tensor_backend_id(node);
 
-        assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
+        GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
 
         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
@@ -1156,7 +1161,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
         size_t src_id = hash_id(src);
         const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
-        assert(src_backend_id != -1); // all inputs should be assigned by now
+        GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
 
         if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
             if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
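The two assert → GGML_ASSERT swaps are not cosmetic: plain assert() compiles to nothing when NDEBUG is defined, so a release build would sail past an unassigned backend and crash later; GGML_ASSERT checks unconditionally. A simplified sketch of the distinction (ggml's real macro also logs file and line):

#include <cassert>
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for GGML_ASSERT: still checks when NDEBUG is defined.
#define SKETCH_ASSERT(x)                                       \
    do {                                                       \
        if (!(x)) {                                            \
            fprintf(stderr, "SKETCH_ASSERT(%s) failed\n", #x); \
            abort();                                           \
        }                                                      \
    } while (0)

int main() {
    int backend_id = -1;
    assert(backend_id != -1);        // no-op in a -DNDEBUG (release) build
    SKETCH_ASSERT(backend_id != -1); // aborts in every build
}

The new pass-4 fallback loop should also make the first assertion unreachable in practice: any node still unassigned after the earlier heuristics gets the first backend that reports support for it.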

ggml/src/ggml-cann/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
@@ -31,6 +31,13 @@ string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
 string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
 message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
+option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)
+
+if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
+    message(FATAL_ERROR
+        "CANN Graph (ACL graph mode) is not supported on 310P devices. "
+        "Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
+endif()
 
 if (CANN_INSTALL_DIR)
     # Only Support Linux.
@@ -68,6 +75,13 @@ if (CANN_INSTALL_DIR)
 
     target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 
+    if (USE_ACL_GRAPH)
+        target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
+        message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
+    else()
+        message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
+    endif()
+
     message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
     message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
 else()
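A typical configure line for the new option might look like this (illustrative; the SOC_VERSION value is an assumption, and the other flags are the existing CANN build options):

cmake -B build -DGGML_CANN=on -DSOC_VERSION=ascend910b3 -DUSE_ACL_GRAPH=ON
cmake --build build --config Release -j

On 310P SoCs the configure step now fails fast with the FATAL_ERROR above rather than producing a build that cannot use graph mode.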

ggml/src/ggml-cann/common.h

Lines changed: 36 additions & 0 deletions
@@ -337,6 +337,29 @@ class cann_task_queue {
     int32_t device_;
 };
 
+#ifdef USE_ACL_GRAPH
+struct ggml_graph_node_properties {
+    void * node_address;
+    ggml_op node_op;
+    int64_t ne[GGML_MAX_DIMS];
+    size_t nb[GGML_MAX_DIMS];
+    void * src_address[GGML_MAX_SRC];
+    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+};
+
+struct ggml_cann_graph {
+    ~ggml_cann_graph() {
+        if (graph != nullptr) {
+            aclmdlRIDestroy(graph);
+        }
+    }
+
+    aclmdlRI graph = nullptr;
+
+    std::vector<ggml_graph_node_properties> ggml_graph_properties;
+};
+#endif  // USE_ACL_GRAPH
+
 /**
  * @brief Context for managing CANN backend operations.
  */
@@ -345,8 +368,13 @@ struct ggml_backend_cann_context {
     std::string name;                /**< Name of the device. */
     std::string description;         /**< Description of the device. */
    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
+#ifdef USE_ACL_GRAPH
+    /// Cached CANN ACL graph used for executing the current ggml computation graph.
+    std::unique_ptr<ggml_cann_graph> cann_graph;
+#endif
     cann_task_queue task_queue;
     bool async_mode;
+    bool support_set_rows;
 
     aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
 
@@ -362,6 +390,14 @@ struct ggml_backend_cann_context {
         async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                       device, async_mode ? "ON" : "OFF");
+
+        support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
+        GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");
+
+        if (!support_set_rows) {
+            GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
+                          "Falling back to eager mode.\n", __func__);
+        }
     }
 
     /**
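Two notes on this block. First, ggml_graph_node_properties snapshots each node's op, shapes, strides, and source addresses, presumably so the backend can tell whether a previously captured ACL graph still matches the incoming ggml graph and can be replayed instead of re-captured (the same bookkeeping ggml's CUDA graph support performs). Second, graph execution is additionally gated on LLAMA_SET_ROWS, read once when the context is constructed; running with it enabled would look roughly like this (illustrative paths and binary name):

LLAMA_SET_ROWS=1 ./build/bin/llama-cli -m ./models/model.gguf -p "hello"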
