
Commit 586a6e6

Merge branch 'ikawrakow:main' into main
2 parents: c7424c9 + 65763a2

297 files changed: +31412, -528 lines


.gitignore

Lines changed: 13 additions & 0 deletions
@@ -130,3 +130,16 @@ poetry.toml
 
 # Scripts
 !/scripts/install-oneapi.bat
+/examples/server/webui_llamacpp/.gitignore
+
+# Test models for lora adapters
+/lora-tests
+
+# Local scripts
+/run-vim.sh
+/run-chat.sh
+.ccache/
+
+# IDE
+*.code-workspace
+.windsurf/

common/chat-parser.cpp

Lines changed: 125 additions & 13 deletions
@@ -3,9 +3,12 @@
 #include "log.h"
 #include "regex-partial.h"
 
+#include <algorithm>
+#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <vector>
 
 using json = nlohmann::ordered_json;
@@ -137,6 +140,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }
 
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
+    std::string pending_reasoning_prefix;
+
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        return false;
+    }
+
+    auto set_reasoning_prefix = [&](size_t prefix_pos) {
+        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
+            return;
+        }
+        if (prefix_pos + start_think.size() > input_.size()) {
+            pending_reasoning_prefix.clear();
+            return;
+        }
+        // Capture the exact literal that opened the reasoning section so we can
+        // surface it back to callers. This ensures formats that force the
+        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
+        // instead of dropping it during parsing.
+        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
+    };
+
     auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
         auto stripped_reasoning = string_strip(reasoning);
         if (stripped_reasoning.empty()) {
@@ -149,28 +173,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                 add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
             }
         } else {
+            if (!pending_reasoning_prefix.empty()) {
+                add_reasoning_content(pending_reasoning_prefix);
+                pending_reasoning_prefix.clear();
+            }
             add_reasoning_content(stripped_reasoning);
         }
     };
-    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
-        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
-            if (auto res = try_find_literal(end_think)) {
-                handle_reasoning(res->prelude, /* closed */ true);
-                consume_spaces();
-                return true;
-            }
-            auto rest = consume_rest();
+
+    const size_t saved_pos = pos_;
+    const size_t saved_content_size = result_.content.size();
+    const size_t saved_reasoning_size = result_.reasoning_content.size();
+
+    auto restore_state = [&]() {
+        move_to(saved_pos);
+        result_.content.resize(saved_content_size);
+        result_.reasoning_content.resize(saved_reasoning_size);
+    };
+
+    // Allow leading whitespace to be preserved as content when reasoning is present at the start
+    size_t cursor = pos_;
+    size_t whitespace_end = cursor;
+    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
+        ++whitespace_end;
+    }
+
+    if (whitespace_end >= input_.size()) {
+        restore_state();
+        if (syntax_.thinking_forced_open) {
+            auto rest = input_.substr(saved_pos);
             if (!rest.empty()) {
                 handle_reasoning(rest, /* closed */ !is_partial());
             }
-            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
-            // if (!syntax_.thinking_forced_open) {
-            //     throw common_chat_msg_partial_exception(end_think);
-            // }
+            move_to(input_.size());
             return true;
         }
+        return false;
+    }
+
+    cursor = whitespace_end;
+    const size_t remaining = input_.size() - cursor;
+    const size_t start_prefix = std::min(start_think.size(), remaining);
+    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
+
+    if (has_start_tag && start_prefix < start_think.size()) {
+        move_to(input_.size());
+        return true;
+    }
+
+    if (has_start_tag) {
+        if (whitespace_end > pos_) {
+            add_content(input_.substr(pos_, whitespace_end - pos_));
+        }
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
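
Illustration (not part of the commit): the streaming behaviour of the new loop relies on spotting a partial "</think>" at the end of the buffer via string_find_partial_stop, so reasoning text is never flushed past a half-received closing tag. A minimal standalone sketch of that idea, using a hypothetical find_partial_stop helper rather than the project's API:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <string>
#include <string_view>

// Return the offset where a suffix of `text` matches a prefix of `stop`,
// or std::string::npos when no suffix of `text` could start the stop string.
static std::size_t find_partial_stop(std::string_view text, std::string_view stop) {
    for (std::size_t len = std::min(text.size(), stop.size()); len > 0; --len) {
        if (text.substr(text.size() - len) == stop.substr(0, len)) {
            return text.size() - len;
        }
    }
    return std::string::npos;
}

int main() {
    std::string chunk = "first outline the plan</thi";  // truncated stream
    std::size_t off = find_partial_stop(chunk, "</think>");
    if (off != std::string::npos) {
        // Only the text before the partial tag is safe to report as reasoning so far.
        printf("reasoning so far: '%s'\n", chunk.substr(0, off).c_str());
    }
    return 0;
}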

common/chat.cpp

Lines changed: 3 additions & 0 deletions
@@ -1207,6 +1207,8 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     return data;
 }
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
@@ -2411,6 +2413,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
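
Illustration (not part of the commit): the two one-line calls added above mean Llama-3.x and plain content-only replies now get their <think> block extracted the same way the DeepSeek-style parsers do. A simplified stand-in for the intended effect on a complete, non-streamed message (the real common_chat_msg_parser also handles streaming, forced-open tags and whitespace):

#include <cstdio>
#include <string>

// Simplified stand-in: pull the leading <think>...</think> block into `reasoning`
// and leave the remainder in `content`.
static void split_reasoning(const std::string & in, std::string & reasoning, std::string & content) {
    const std::string open = "<think>", close = "</think>";
    const auto b = in.find(open);
    const auto e = in.find(close);
    if (b == std::string::npos || e == std::string::npos || e < b) {
        reasoning.clear();
        content = in;
        return;
    }
    reasoning = in.substr(b + open.size(), e - (b + open.size()));
    content   = in.substr(e + close.size());
}

int main() {
    std::string reasoning, content;
    split_reasoning("<think>outline the answer</think>Hello!", reasoning, content);
    // expected: reasoning_content = "outline the answer", content = "Hello!"
    printf("reasoning_content: '%s'\ncontent: '%s'\n", reasoning.c_str(), content.c_str());
    return 0;
}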

common/common.cpp

Lines changed: 58 additions & 2 deletions
@@ -200,6 +200,23 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
+//
+// Arg utils
+//
+common_webui common_webui_from_name(const std::string& format) {
+    if (format == "none") {
+        return COMMON_WEBUI_NONE;
+    }
+    else if (format == "auto") {
+        return COMMON_WEBUI_AUTO;
+    }
+    else if (format == "llamacpp") {
+        return COMMON_WEBUI_LLAMACPP;
+    }
+    else {
+        return COMMON_WEBUI_AUTO;
+    }
+}
 
 static std::string read_file(const std::string& fname) {
     std::ifstream file(fname);
@@ -210,6 +227,14 @@ static std::string read_file(const std::string& fname) {
     file.close();
     return content;
 }
+
+static std::string parse_device_list(const std::string& value) {
+    if (value == " " || value.find("-") != std::string::npos) {
+        throw std::invalid_argument("no devices specified");
+    }
+    return value;
+}
+
 //
 // CLI argument parsing
 //
@@ -1052,7 +1077,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
        CHECK_ARG
        params.n_gpu_layers_draft = std::stoi(argv[i]);
        if (!llama_supports_gpu_offload()) {
@@ -1199,6 +1224,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        else { invalid_param = true; }
        return true;
    }
+    if (arg == "-dev" || arg == "--device") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices = parse_device_list(value);
+        return true;
+    }
+    if (arg == "-devd" || arg == "--device-draft") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        params.devices_draft = parse_device_list(value);
+        return true;
+    }
    if (arg == "-v" || arg == "--verbose") {
        params.verbosity = 1;
        return true;
@@ -1417,6 +1454,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.public_path = argv[i];
        return true;
    }
+    if (arg == "--webui") {
+        CHECK_ARG
+        params.webui = common_webui_from_name(std::string(argv[i]));
+        return true;
+    }
    if (arg == "--api-key") {
        CHECK_ARG
        params.api_keys.push_back(argv[i]);
@@ -1888,6 +1930,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
        "- none: leaves thoughts unparsed in `message.content`\n"
        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
        "(default: none)", });
    options.push_back({ "main", " --chat-template-kwargs JSON", "sets additional params for the json template parser"});
    options.push_back({ "main", " --reasoning-budget N", "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)" });
@@ -1982,6 +2025,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
        " - row: split rows across GPUs" });
    options.push_back({ "*", "-ts, --tensor-split SPLIT",
        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+    options.push_back({ "*", "-dev, --device dev1,dev2",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
+    options.push_back({ "*", "-devd, --device-draft dev1,dev2",
+        "comma-separated list of devices to use for offloading for the draft model (none = don't offload)\n"
+        "Example: CUDA0,CUDA1,RPC[192.168.0.1:8080]\n" });
    options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
        "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
    }
@@ -2046,6 +2095,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
    options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
    options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server", " --webui NAME",
+        "controls which webui to server:\n"
+        "- none: disable webui\n"
+        "- auto: default webui \n"
+        "- llamacpp: llamacpp webui \n"
+        "(default: auto)", });
    options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
    options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
    options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -2549,7 +2604,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
    } else {
        model = llama_load_model_from_file(params.model.c_str(), mparams);
    }
-    
+
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
        return iparams;
@@ -2666,6 +2721,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
    auto mparams = llama_model_default_params();
+    mparams.devices = params.devices.c_str();
 
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
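
Illustration (not part of the commit): the new --webui argument is mapped onto the common_webui enum with unknown names falling back to auto, while -dev/-devd store a comma-separated device string that later reaches the model parameters as mparams.devices = params.devices.c_str(). A self-contained copy of the mapping so it can be compiled and tried in isolation (the mini enum is repeated only to keep the snippet standalone):

#include <cstdio>
#include <string>

enum common_webui { COMMON_WEBUI_NONE, COMMON_WEBUI_AUTO, COMMON_WEBUI_LLAMACPP };

// Same mapping as the new helper in common.cpp: unrecognized names fall back to "auto".
static common_webui common_webui_from_name(const std::string & format) {
    if (format == "none")     { return COMMON_WEBUI_NONE; }
    if (format == "auto")     { return COMMON_WEBUI_AUTO; }
    if (format == "llamacpp") { return COMMON_WEBUI_LLAMACPP; }
    return COMMON_WEBUI_AUTO;
}

int main() {
    // --webui llamacpp  ->  COMMON_WEBUI_LLAMACPP
    printf("--webui llamacpp -> %d\n", (int) common_webui_from_name("llamacpp"));
    // --webui typo      ->  COMMON_WEBUI_AUTO (fallback)
    printf("--webui typo     -> %d\n", (int) common_webui_from_name("typo"));
    return 0;
}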

common/common.h

Lines changed: 16 additions & 3 deletions
@@ -109,6 +109,14 @@ enum common_reasoning_format {
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
+enum common_webui {
+    COMMON_WEBUI_NONE,
+    COMMON_WEBUI_AUTO,
+    COMMON_WEBUI_LLAMACPP,
+};
+
+common_webui common_webui_from_name(const std::string& format);
+
 struct model_paths {
     std::string path = ""; // model local path // NOLINT
     std::string url = ""; // model url to download // NOLINT
@@ -118,6 +126,9 @@ struct model_paths {
 };
 
 struct gpt_params {
+    std::string devices;
+    std::string devices_draft;
+
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_threads = cpu_get_num_math();
@@ -185,6 +196,7 @@ struct gpt_params {
     std::string logits_file = ""; // file for saving *all* logits
     std::string rpc_servers = ""; // comma separated list of RPC servers
 
+
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -288,7 +300,7 @@ struct gpt_params {
     bool use_jinja = false; // NOLINT
     std::string system_prompt = "";
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true;
 
@@ -300,8 +312,8 @@ struct gpt_params {
     std::map<std::string, std::string> default_template_kwargs;
 
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
-    bool endpoint_slots = false;
+    common_webui webui = COMMON_WEBUI_AUTO;
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
@@ -432,6 +444,7 @@ bool fs_create_directory_with_parents(const std::string & path);
 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
 
+
 //
 // Model utils
 //
