
Commit 8a71eb0

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	ggml/cmake/ggml-config.cmake.in
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-cann/common.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-cuda/fattn.cu
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	requirements/requirements-convert_hf_to_gguf.txt
#	scripts/compare-llama-bench.py
#	tests/test-chat-template.cpp
#	tests/test-chat.cpp
#	tools/llama-bench/llama-bench.cpp
2 parents 338b1fe + 9a96389 commit 8a71eb0

File tree: 29 files changed (+1350, -291 lines)

common/chat-parser.cpp

Lines changed: 9 additions & 1 deletion
@@ -55,7 +55,15 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
 bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
     std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
     std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
-    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
+    std::string arguments = "";
+    if (tool_call.contains("arguments")) {
+        if (tool_call.at("arguments").is_object()) {
+            arguments = tool_call.at("arguments").dump();
+        } else {
+            arguments = tool_call.at("arguments");
+        }
+    }
+
     return add_tool_call(name, id, arguments);
 }
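Note: some chat templates emit "arguments" as a JSON object while others emit it as a pre-serialized string, and with nlohmann/json the old ternary would throw at runtime when assigning an object to std::string. A minimal standalone sketch of the new normalization (the get_weather payloads are invented for illustration):

    #include <iostream>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // mirrors the patched logic in common_chat_msg_parser::add_tool_call
    static std::string normalize_arguments(const json & tool_call) {
        std::string arguments = "";
        if (tool_call.contains("arguments")) {
            if (tool_call.at("arguments").is_object()) {
                arguments = tool_call.at("arguments").dump(); // serialize the object
            } else {
                arguments = tool_call.at("arguments");        // already a string
            }
        }
        return arguments;
    }

    int main() {
        // hypothetical payloads: arguments as an object vs. as a string
        json as_object = json::parse(R"({"name":"get_weather","arguments":{"city":"Paris"}})");
        json as_string = json::parse(R"({"name":"get_weather","arguments":"{\"city\":\"Paris\"}"})");
        std::cout << normalize_arguments(as_object) << "\n"; // {"city":"Paris"}
        std::cout << normalize_arguments(as_string) << "\n"; // {"city":"Paris"}
        return 0;
    }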

common/chat.cpp

Lines changed: 129 additions & 0 deletions
@@ -606,6 +606,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
@@ -618,6 +619,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
         case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
+        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
@@ -1734,6 +1736,124 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }

+static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for Granite template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
+    data.format = COMMON_CHAT_FORMAT_GRANITE;
+
+    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (!inputs.tools.is_null()) {
+        // Granite uses <|tool_call|> followed by JSON list
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
+                    "-args", {
+                        {"type", "object"},
+                        {"properties", {
+                            {"name", {{"const", name}}},
+                            {"arguments", parameters},
+                        }},
+                        {"required", json::array({"name", "arguments"})},
+                    })));
+            });
+
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+            auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
+
+            if (data.thinking_forced_open) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
+            } else {
+                builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
+            }
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+                "<|tool_call|>"
+            });
+
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+                "<|tool_call|>",
+            };
+        });
+    } else {
+        // Handle thinking tags for non-tool responses
+        if (data.thinking_forced_open && inputs.enable_thinking) {
+            data.grammar_lazy = false;
+            data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+            };
+        }
+    }
+
+    return data;
+}
+
+static void common_chat_parse_granite(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+
+    // Parse response tags using regex
+    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
+    if (auto res = builder.try_find_regex(response_regex)) {
+        // Extract the content between the tags (capture group 1)
+        auto content = builder.str(res->groups[1]);
+        builder.add_content(content);
+        builder.move_to(res->groups[0].end);
+    }
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.add_tool_calls(tool_calls_data.json)) {
+                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+            }
+        } else {
+            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        }
+    } else {
+        builder.add_content(builder.consume_rest());
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
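For reference, common_chat_parse_granite above expects completions shaped roughly like the following (a hypothetical sample; the tool name and arguments are invented, and the <think>/<response> blocks only appear when thinking is enabled):

    <think>The user wants the weather, so I should call the weather tool.</think>
    <response>Let me look that up.</response>
    <|tool_call|>[{"name": "get_weather", "arguments": {"city": "Paris"}}]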
@@ -1805,6 +1925,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_command_r7b(tmpl, params);
     }

+    // Granite (IBM) - detects thinking / tools support
+    if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+        return common_chat_params_init_granite(tmpl, params);
+    }
+
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
     if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
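The detection is substring-based rather than keyed to an explicit format tag: any Jinja template whose source contains both "elif thinking" and the <|tool_call|> token is routed to the Granite handler. A made-up fragment of the kind of template source that would match (not the actual IBM Granite template):

    {%- if tools %} ... {%- elif thinking %}{{ '<think>' }}{%- endif %}
    {{- '<|tool_call|>' }} ...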
@@ -1865,6 +1990,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     int alloc_size = 0;
     std::vector<llama_chat_message> chat;
     std::vector<std::string> contents;
+
     for (const auto & msg : inputs.messages) {
         auto content = msg.content;
         for (const auto & part : msg.content_parts) {
@@ -1966,6 +2092,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GRANITE:
+            common_chat_parse_granite(builder);
+            break;
         case COMMON_CHAT_FORMAT_GPT_OSS:
             common_chat_parse_gpt_oss(builder);
             break;

common/chat.h

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,

     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -235,6 +235,7 @@ enum common_reasoning_format {
     COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    COMMON_REASONING_FORMAT_GRANITE,         // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

 struct common_params {

ggml/src/ggml-backend.cpp

Lines changed: 7 additions & 2 deletions
@@ -1077,6 +1077,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
         }
+        // if the node is still unassigned, assign it to the first backend that supports it
+        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
+            ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
+        }
+        GGML_ASSERT(*cur_backend_id != -1);
     }

     // pass 5: split graph, find tensors that need to be copied
@@ -1104,7 +1109,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

         const int node_backend_id = tensor_backend_id(node);

-        assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
+        GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

         // check if we should start a new split based on the sources of the current node
         bool need_new_split = false;
@@ -1162,7 +1167,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

             size_t src_id = hash_id(src);
             const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
-            assert(src_backend_id != -1); // all inputs should be assigned by now
+            GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now

             if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                 if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
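The assert → GGML_ASSERT swaps here are not cosmetic: plain assert compiles to a no-op when NDEBUG is defined (the usual release configuration), so these scheduler invariants were previously unchecked in release builds. A simplified sketch of the distinction (the real macro lives in ggml.h and, roughly, also logs the failing location before aborting):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    // plain assert: removed entirely under -DNDEBUG
    // assert(node_backend_id != -1);

    // always-on assert in the spirit of GGML_ASSERT (simplified sketch)
    #define MY_ASSERT(x)                                             \
        do {                                                         \
            if (!(x)) {                                              \
                fprintf(stderr, "%s:%d: assertion failed: %s\n",     \
                        __FILE__, __LINE__, #x);                     \
                abort();                                             \
            }                                                        \
        } while (0)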

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 17 additions & 20 deletions
@@ -35,7 +35,7 @@

 // ggml-backend interface

-std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
     static std::vector<ggml_backend_buffer_type_t> bufts = []() {
         std::vector<ggml_backend_buffer_type_t> bufts;

@@ -57,23 +57,27 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
         }
 #endif

-        bufts.push_back(NULL);
-
         return bufts;
     }();

     return bufts;
 }

 static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
-    return ggml_backend_cpu_get_extra_buffers_type().data();
+    static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
+        std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
+        bufts.push_back(nullptr);
+        return bufts;
+    }();
+
+    return extra_bufts.data();

     GGML_UNUSED(device);
 }

 static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
-    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra && extra == buft) {
+    for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
+        if (extra == buft) {
             return true;
         }
     }
@@ -397,20 +401,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         return true;
     }

-    // extra_buffer_op?
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra) {
-            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
-            if (buf_extra && buf_extra->supports_op(dev, op)) {
-                return true;
-            }
-        }
-    }
-
-    // the other case need host buffer.
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
-            return false;
+    // check extra buffer types
+    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
+    for (int i = 0; i < 4; i++) {
+        if (op->src[i] && op->src[i]->buffer &&
+            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
+            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
+            return buf_extra->supports_op(dev, op);
         }
     }
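For context, the rewritten supports_op path delegates to the ggml::cpu::extra_buffer_type interface used by accelerated/repacked CPU buffer types, and it now answers based on the buffer type that actually holds one of the op's first four sources instead of polling every registered type. A sketch of that contract, inferred from how it is used in this diff (signatures assumed, see ggml/src/ggml-cpu/traits.h):

    namespace ggml::cpu {

    class tensor_traits; // per-tensor compute hooks

    class extra_buffer_type {
      public:
        virtual ~extra_buffer_type();
        // can the CPU backend run this op out of this buffer type?
        virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
        // traits for tensors stored in this buffer type, or nullptr
        virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
    };

    }  // namespace ggml::cpu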

ggml/src/ggml-cpu/traits.cpp

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 }  // namespace ggml::cpu

 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }

 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);

ggml/src/ggml-cpu/traits.h

Lines changed: 1 addition & 1 deletion
@@ -33,6 +33,6 @@ class extra_buffer_type {
 }  // namespace ggml::cpu

 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();

 #endif

ggml/src/ggml-cuda/common.cuh

Lines changed: 10 additions & 2 deletions
@@ -237,9 +237,13 @@ typedef float2 dfloat2;
 #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
-#define NEW_MMA_AVAILABLE
+#define TURING_MMA_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#define AMPERE_MMA_AVAILABLE
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 #define CP_ASYNC_AVAILABLE
 #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
@@ -307,10 +311,14 @@ static bool amd_mfma_available(const int cc) {
 }

 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
-static bool new_mma_available(const int cc) {
+static bool turing_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }

+static bool ampere_mma_available(const int cc) {
+    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
+}
+
 static bool cp_async_available(const int cc) {
     return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
 }
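A usage sketch for the renamed guards: device code gates at compile time on the macros, host code gates at run time on the helpers against the device's compute capability cc (kernel contents invented for illustration):

    // device code: compile-time architecture gate
    #if defined(AMPERE_MMA_AVAILABLE)
        // Ampere+ tensor-core path
    #elif defined(TURING_MMA_AVAILABLE)
        // Turing tensor-core path (formerly guarded by NEW_MMA_AVAILABLE)
    #else
        // fallback path
    #endif

    // host code: run-time gate when picking a kernel variant
    // if (ampere_mma_available(cc))      { /* launch Ampere variant */ }
    // else if (turing_mma_available(cc)) { /* launch Turing variant */ }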
