LostRuins
diff --git a/common/arg.cpp b/common/arg.cpp
Lines changed: 266 additions & 202 deletions
diff --git a/common/chat.cpp b/common/chat.cpp
Lines changed: 27 additions & 15 deletions
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
Lines changed: 1 addition & 1 deletion
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/ggml/src/ggml-cpu/common.h b/ggml/src/ggml-cpu/common.h
Lines changed: 14 additions & 0 deletions
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
Lines changed: 1 addition & 1 deletion
@@ -1741,10 +1741,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
+    const std::optional<json> tools_override = json();
+    const std::optional<json> additional_context = json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
-    });
+    };
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2230,15 +2232,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
 
 static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     // Parse thinking tags
+    static const common_regex start_think_regex(regex_escape("<think>"));
+    static const common_regex end_think_regex(regex_escape("</think>"));
+    // Granite models output partial tokens such as "<" and "<think".
+    // By leveraging try_consume_regex()/try_find_regex() throwing
+    // common_chat_msg_partial_exception for these partial tokens,
+    // processing is interrupted and the tokens are not passed to add_content().
+    if (auto res = builder.try_consume_regex(start_think_regex)) {
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+        builder.try_find_regex(end_think_regex, std::string::npos, false);
+        // Restore position for try_parse_reasoning()
+        builder.move_to(res->groups[0].begin);
+    }
     builder.try_parse_reasoning("<think>", "</think>");
 
-    // Parse response tags using regex
-    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
-    if (auto res = builder.try_find_regex(response_regex)) {
-        // Extract the content between the tags (capture group 1)
-        auto content = builder.str(res->groups[1]);
-        builder.add_content(content);
-        builder.move_to(res->groups[0].end);
+    // Parse response tags
+    static const common_regex start_response_regex(regex_escape("<response>"));
+    static const common_regex end_response_regex(regex_escape("</response>"));
+    // Granite models output partial tokens such as "<" and "<response".
+    // Same hack as reasoning parsing.
+    if (builder.try_consume_regex(start_response_regex)) {
+        builder.try_find_regex(end_response_regex);
     }
 
     if (!builder.syntax().parse_tool_calls) {
@@ -2252,13 +2267,10 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
         builder.move_to(res->groups[0].end);
 
         // Expect JSON array of tool calls
-        auto tool_calls_data = builder.consume_json();
-        if (tool_calls_data.json.is_array()) {
-            if (!builder.add_tool_calls(tool_calls_data.json)) {
-                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
+            if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
             }
-        } else {
-            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
         }
     } else {
         builder.add_content(builder.consume_rest());
 
@@ -116,7 +116,7 @@ extern "C" {
         void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
 
         // (optional) sort/optimize the nodes in the graph
-        void                      (*optimize_graph)    (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void                      (*graph_optimize)    (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     };
 
     struct ggml_backend {
 
@@ -463,10 +463,10 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
     backend->iface.event_wait(backend, event);
 }
 
-static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     GGML_ASSERT(backend);
-    if (backend->iface.optimize_graph != NULL) {
-        backend->iface.optimize_graph(backend, cgraph);
+    if (backend->iface.graph_optimize != NULL) {
+        backend->iface.graph_optimize(backend, cgraph);
     }
 }
 
@@ -1313,7 +1313,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
 
         // Optimize this split of the graph. This needs to happen before we make graph_copy,
         // so they are in sync.
-        ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
+        ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
 
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
 
@@ -270,7 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_compute           = */ ggml_backend_blas_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
-    /* .optimize_graph          = */ NULL,
+    /* .graph_optimize          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_blas_guid(void) {
 
@@ -28,6 +28,14 @@ static inline float bf16_to_f32(ggml_bf16_t x) {
     return GGML_BF16_TO_FP32(x);
 }
 
+static inline float i32_to_f32(int32_t x) {
+    return x;
+}
+
+static inline int32_t f32_to_i32(float x) {
+    return x;
+}
+
 static inline float f32_to_f32(float x) {
     return x;
 }
@@ -54,6 +62,12 @@ struct type_conversion_table<ggml_bf16_t> {
     static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
 };
 
+template <>
+struct type_conversion_table<int32_t> {
+    static constexpr float (*to_f32)(int32_t) = i32_to_f32;
+    static constexpr int32_t (*from_f32)(float) = f32_to_i32;
+};
+
 static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
     const int64_t ith = params->ith;
     const int64_t nth = params->nth;
 
@@ -190,7 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
-    /* .optimize_graph          = */ NULL,
+    /* .graph_optimize          = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
Original file line number	Diff line number	Diff line change
`@@ -463,10 +463,10 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)`
`463`	`463`	`backend->iface.event_wait(backend, event);`
`464`	`464`	`}`
`465`	`465`
`466`		`-static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {`
	`466`	`+static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {`
`467`	`467`	`GGML_ASSERT(backend);`
`468`		`- if (backend->iface.optimize_graph != NULL) {`
`469`		`- backend->iface.optimize_graph(backend, cgraph);`
	`468`	`+ if (backend->iface.graph_optimize != NULL) {`
	`469`	`+ backend->iface.graph_optimize(backend, cgraph);`
`470`	`470`	`}`
`471`	`471`	`}`
`472`	`472`
`@@ -1313,7 +1313,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra`
`1313`	`1313`
`1314`	`1314`	`// Optimize this split of the graph. This needs to happen before we make graph_copy,`
`1315`	`1315`	`// so they are in sync.`
`1316`		`- ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);`
	`1316`	`+ ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);`
`1317`	`1317`
`1318`	`1318`	`// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split`
`1319`	`1319`	`for (int j = 0; j < split->n_inputs; j++) {`