
Commit b72d755

Merge branch 'master' into xsn/vision_2
2 parents: b986af8 + 564804b

10 files changed: +165 −92 lines


README.md
Lines changed: 3 additions & 1 deletion

@@ -16,7 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
-- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
+- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
+- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
+- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

common/chat-template.hpp
Lines changed: 28 additions & 9 deletions

@@ -25,21 +25,22 @@ class chat_template {
     // Meta-Llama-3.1-8B-Instruct's template expects arguments to be an object.
     // Most other templates (and OpenAI's API) expect the arguments object to be stringified.
     bool requires_object_arguments_ = false;
+    bool requires_typed_content_ = false;
     bool supports_system_role_ = true;
     bool supports_parallel_tool_calls_ = false;
     std::string source_;
     std::string bos_token_;
     std::string eos_token_;
     std::shared_ptr<minja::TemplateNode> template_root_;
 
-    std::string try_render(
+    std::string try_raw_render(
         const nlohmann::ordered_json & messages,
         const nlohmann::ordered_json & tools,
         bool add_generation_prompt,
         const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
     {
         try {
-            auto prompt = apply(messages, tools, add_generation_prompt, extra_context);
+            auto prompt = apply(messages, tools, add_generation_prompt, extra_context, /* adjust_inputs= */ false);
             // fprintf(stderr, "Prompt: %s\n", prompt.c_str());
             return prompt;
         } catch (const std::exception & e) {
@@ -60,7 +61,7 @@ class chat_template {
         supports_tools_ = source.find("tools") != std::string::npos;
 
         auto renders_string_arguments =
-            try_render({
+            try_raw_render({
                 {
                     {"role", "user"},
                     {"content", "Hey"}
@@ -81,7 +82,7 @@ class chat_template {
             }, {}, false).find("{\"code\": \"print") != std::string::npos;
         if (!renders_string_arguments) {
             auto renders_object_arguments =
-                try_render({
+                try_raw_render({
                     {
                         {"role", "user"},
                         {"content", "Hey"}
@@ -106,10 +107,13 @@ class chat_template {
         }
         supports_parallel_tool_calls_ = source.find("tool_call_id") != std::string::npos;
 
-        supports_system_role_ = try_render({
+        supports_system_role_ = try_raw_render({
             {{"role", "system"}, {"content", "<System Needle>"}},
             {{"role", "user"},   {"content", "Hey"}}
         }, {}, false).find("<System Needle>") != std::string::npos;
+
+        requires_typed_content_ = try_raw_render({{{"role", "user"}, {"content", "Hey"}}}, {}, false).find("Hey") == std::string::npos
+            && try_raw_render({{{"role", "user"}, {"content", {{{"type", "text"}, {"text", "Hey"}}}}}}, {}, false).find("Hey") != std::string::npos;
     }
 
     const std::string & source() const { return source_; }
@@ -122,19 +126,34 @@ class chat_template {
         const nlohmann::ordered_json & messages,
         const nlohmann::ordered_json & tools,
         bool add_generation_prompt,
-        const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
+        const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
+        bool adjust_inputs = true) const
     {
         json actual_messages;
 
         // First, "fix" messages so they have a chance to be rendered correctly by the template
 
-        if (requires_object_arguments_ || !supports_system_role_ || !supports_tools_) {
+        if (adjust_inputs && (requires_object_arguments_ || !supports_system_role_ || !supports_tools_ || requires_typed_content_)) {
             actual_messages = json::array();
 
+            auto add_message = [&](const json & msg) {
+                if (requires_typed_content_ && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
+                    actual_messages.push_back({
+                        {"role", msg.at("role")},
+                        {"content", {{
+                            {"type", "text"},
+                            {"text", msg.at("content")},
+                        }}},
+                    });
+                } else {
+                    actual_messages.push_back(msg);
+                }
+            };
+
             std::string pending_system;
             auto flush_sys = [&]() {
                 if (!pending_system.empty()) {
-                    actual_messages.push_back({
+                    add_message({
                         {"role", "user"},
                         {"content", pending_system},
                     });
@@ -217,7 +236,7 @@ class chat_template {
                     }
                 }
            }
-            actual_messages.push_back(message);
+            add_message(message);
        }
        flush_sys();
    } else {
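
The new `requires_typed_content_` probe and the `add_message` lambda handle templates that only emit message content when it is passed as a list of typed parts rather than a plain string. Below is a minimal standalone sketch of that transformation using nlohmann::json; the helper name `wrap_typed_content` is hypothetical and not part of the diff, and `chat_template` only applies the equivalent wrapping when `requires_typed_content_` was detected at construction time.

```cpp
// Standalone sketch (not library code) of the content adjustment performed by the
// add_message lambda above: a plain string content is wrapped as a single typed
// text part, everything else passes through unchanged.
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

static json wrap_typed_content(const json & msg) {
    if (msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
        return json {
            {"role",    msg.at("role")},
            {"content", json::array({
                { {"type", "text"}, {"text", msg.at("content")} },
            })},
        };
    }
    return msg; // already typed (array), null, or missing: leave as-is
}

int main() {
    json msg = { {"role", "user"}, {"content", "Hey"} };
    std::cout << wrap_typed_content(msg).dump() << "\n";
    // {"role":"user","content":[{"type":"text","text":"Hey"}]}
}
```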

common/minja.hpp
Lines changed: 26 additions & 2 deletions

@@ -693,7 +693,7 @@ enum SpaceHandling { Keep, Strip, StripSpaces, StripNewline };
 
 class TemplateToken {
 public:
-    enum class Type { Text, Expression, If, Else, Elif, EndIf, For, EndFor, Set, EndSet, Comment, Macro, EndMacro, Filter, EndFilter };
+    enum class Type { Text, Expression, If, Else, Elif, EndIf, For, EndFor, Generation, EndGeneration, Set, EndSet, Comment, Macro, EndMacro, Filter, EndFilter };
 
     static std::string typeToString(Type t) {
         switch (t) {
@@ -712,6 +712,8 @@ class TemplateToken {
             case Type::EndMacro: return "endmacro";
             case Type::Filter: return "filter";
             case Type::EndFilter: return "endfilter";
+            case Type::Generation: return "generation";
+            case Type::EndGeneration: return "endgeneration";
         }
         return "Unknown";
     }
@@ -788,6 +790,14 @@ struct EndForTemplateToken : public TemplateToken {
     EndForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, location, pre, post) {}
 };
 
+struct GenerationTemplateToken : public TemplateToken {
+    GenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, location, pre, post) {}
+};
+
+struct EndGenerationTemplateToken : public TemplateToken {
+    EndGenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, location, pre, post) {}
+};
+
 struct SetTemplateToken : public TemplateToken {
     std::string ns;
     std::vector<std::string> var_names;
@@ -2149,7 +2159,7 @@ class Parser {
     static std::regex comment_tok(R"(\{#([-~]?)(.*?)([-~]?)#\})");
     static std::regex expr_open_regex(R"(\{\{([-~])?)");
     static std::regex block_open_regex(R"(^\{%([-~])?[\s\n\r]*)");
-    static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|set|endset|block|endblock|macro|endmacro|filter|endfilter)\b)");
+    static std::regex block_keyword_tok(R"((if|else|elif|endif|for|endfor|generation|endgeneration|set|endset|block|endblock|macro|endmacro|filter|endfilter)\b)");
     static std::regex non_text_open_regex(R"(\{\{|\{%|\{#)");
     static std::regex expr_close_regex(R"([\s\n\r]*([-~])?\}\})");
     static std::regex block_close_regex(R"([\s\n\r]*([-~])?%\})");
@@ -2229,6 +2239,12 @@ class Parser {
             } else if (keyword == "endfor") {
                 auto post_space = parseBlockClose();
                 tokens.push_back(std::make_unique<EndForTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "generation") {
+                auto post_space = parseBlockClose();
+                tokens.push_back(std::make_unique<GenerationTemplateToken>(location, pre_space, post_space));
+            } else if (keyword == "endgeneration") {
+                auto post_space = parseBlockClose();
+                tokens.push_back(std::make_unique<EndGenerationTemplateToken>(location, pre_space, post_space));
             } else if (keyword == "set") {
                 static std::regex namespaced_var_regex(R"((\w+)[\s\n\r]*\.[\s\n\r]*(\w+))");
 
@@ -2330,6 +2346,13 @@ class Parser {
                     throw unterminated(**start);
                 }
                 children.emplace_back(std::make_shared<ForNode>(token->location, std::move(for_token->var_names), std::move(for_token->iterable), std::move(for_token->condition), std::move(body), for_token->recursive, std::move(else_body)));
+            } else if (dynamic_cast<GenerationTemplateToken*>(token.get())) {
+                auto body = parseTemplate(begin, it, end);
+                if (it == end || (*(it++))->type != TemplateToken::Type::EndGeneration) {
+                    throw unterminated(**start);
+                }
+                // Treat as a no-op, as our scope is templates for inference, not training (`{% generation %}` wraps generated tokens for masking).
+                children.emplace_back(std::move(body));
             } else if (auto text_token = dynamic_cast<TextTemplateToken*>(token.get())) {
                 SpaceHandling pre_space = (it - 1) != begin ? (*(it - 2))->post_space : SpaceHandling::Keep;
                 SpaceHandling post_space = it != end ? (*it)->pre_space : SpaceHandling::Keep;
@@ -2397,6 +2420,7 @@ class Parser {
                 || dynamic_cast<EndFilterTemplateToken*>(token.get())
                 || dynamic_cast<EndIfTemplateToken*>(token.get())
                 || dynamic_cast<ElseTemplateToken*>(token.get())
+                || dynamic_cast<EndGenerationTemplateToken*>(token.get())
                 || dynamic_cast<ElifTemplateToken*>(token.get())) {
                 it--; // unconsume the token
                 break; // exit the loop
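
For context, `{% generation %} ... {% endgeneration %}` is used by some Hugging Face chat templates to mark the span of assistant-generated tokens so that training code can build a loss mask; for inference, minja now simply parses the pair and emits whatever the body renders to. The fragment below is a hypothetical illustration of the syntax (not taken from the diff), embedded in a C++ raw string to keep the example self-contained.

```cpp
// Hypothetical chat-template fragment showing where {% generation %} appears:
// it brackets the assistant's reply so a training pipeline can mask everything
// else out of the loss. With the parser change above, minja accepts the block
// and renders its body unchanged for inference.
#include <iostream>
#include <string>

int main() {
    const std::string chat_template = R"TMPL(
{%- for message in messages -%}
<|{{ message.role }}|>
{%- if message.role == 'assistant' -%}
{% generation %}{{ message.content }}{% endgeneration %}
{%- else -%}
{{ message.content }}
{%- endif -%}
{%- endfor -%}
)TMPL";

    // Rendering this template now produces the same text as if the
    // {% generation %}/{% endgeneration %} markers were absent.
    std::cout << chat_template << "\n";
}
```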

examples/main/README.md
Lines changed: 2 additions & 2 deletions

@@ -310,9 +310,9 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 ### Batch Size
 
-- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`.
 
-- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
+- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`.
 
 ### Prompt Caching
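
As an illustration of the two flags (assuming the `llama-cli` binary built from this example), an invocation such as `llama-cli -m model.gguf -f prompt.txt -b 2048 -ub 512` processes each logical batch of up to 2048 tokens as physical micro-batches of 512 tokens; raising `-b` beyond `-ub` only helps when pipeline parallelism across multiple GPUs is in use.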

examples/run/run.cpp
Lines changed: 13 additions & 9 deletions

@@ -147,7 +147,8 @@ class Opt {
             if (handle_option_with_value(argc, argv, i, context_size) == 1) {
                 return 1;
             }
-        } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+        } else if (options_parsing &&
+                   (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
             if (handle_option_with_value(argc, argv, i, ngl) == 1) {
                 return 1;
             }
@@ -194,7 +195,7 @@ class Opt {
             "Options:\n"
             "  -c, --context-size <value>\n"
             "      Context size (default: %d)\n"
-            "  -n, --ngl <value>\n"
+            "  -n, -ngl, --ngl <value>\n"
             "      Number of GPU layers (default: %d)\n"
             "  --temp <value>\n"
             "      Temperature (default: %.1f)\n"
@@ -634,20 +635,20 @@ class LlamaData {
         return path.substr(pos + 1);
     }
 
-    int remove_proto(std::string & model_) {
-        const std::string::size_type pos = model_.find("://");
+    int rm_until_substring(std::string & model_, const std::string & substring) {
+        const std::string::size_type pos = model_.find(substring);
         if (pos == std::string::npos) {
             return 1;
         }
 
-        model_ = model_.substr(pos + 3); // Skip past "://"
+        model_ = model_.substr(pos + substring.size()); // Skip past the substring
         return 0;
     }
 
     int resolve_model(std::string & model_) {
         int ret = 0;
         if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
-            remove_proto(model_);
+            rm_until_substring(model_, "://");
 
             return ret;
         }
@@ -656,13 +657,16 @@ class LlamaData {
         const std::vector<std::string> headers = { "--header",
                                                    "Accept: application/vnd.docker.distribution.manifest.v2+json" };
         if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
-            remove_proto(model_);
+            rm_until_substring(model_, "://");
+            ret = huggingface_dl(model_, headers, bn);
+        } else if (string_starts_with(model_, "hf.co/")) {
+            rm_until_substring(model_, "hf.co/");
             ret = huggingface_dl(model_, headers, bn);
         } else if (string_starts_with(model_, "ollama://")) {
-            remove_proto(model_);
+            rm_until_substring(model_, "://");
             ret = ollama_dl(model_, headers, bn);
         } else if (string_starts_with(model_, "https://")) {
-            download(model_, headers, bn, true);
+            ret = download(model_, headers, bn, true);
         } else {
            ret = ollama_dl(model_, headers, bn);
        }
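
The generalization from `remove_proto` to `rm_until_substring` is what lets `resolve_model` accept bare `hf.co/...` references alongside `hf://` and `huggingface://`. A minimal standalone sketch of the helper's behavior; the model name used here is an arbitrary placeholder, not taken from the diff.

```cpp
// Standalone sketch of rm_until_substring() as introduced above: return 1 and
// leave the input untouched if the substring is absent, otherwise strip
// everything up to and including the substring.
#include <cassert>
#include <string>

static int rm_until_substring(std::string & model_, const std::string & substring) {
    const std::string::size_type pos = model_.find(substring);
    if (pos == std::string::npos) {
        return 1;
    }
    model_ = model_.substr(pos + substring.size()); // skip past the substring
    return 0;
}

int main() {
    std::string a = "hf://user/model-GGUF";          // handled by the hf:// branch
    std::string b = "huggingface://user/model-GGUF"; // same branch, same "://" cut
    std::string c = "hf.co/user/model-GGUF";         // new branch, cuts "hf.co/"

    rm_until_substring(a, "://");
    rm_until_substring(b, "://");
    rm_until_substring(c, "hf.co/");

    assert(a == "user/model-GGUF" && a == b && b == c);
    return 0;
}
```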

examples/server/server.cpp
Lines changed: 38 additions & 8 deletions

@@ -1433,6 +1433,10 @@ struct server_queue {
         } else {
             queue_tasks.push_back(std::move(task));
         }
+        // if this is cancel task make sure to clean up pending tasks
+        if (task.type == SERVER_TASK_TYPE_CANCEL) {
+            cleanup_pending_task(task.id_target);
+        }
         condition_tasks.notify_one();
         return task.id;
     }
@@ -1450,6 +1454,10 @@ struct server_queue {
             } else {
                 queue_tasks.push_back(std::move(task));
             }
+            // if this is cancel task make sure to clean up pending tasks
+            if (task.type == SERVER_TASK_TYPE_CANCEL) {
+                cleanup_pending_task(task.id_target);
+            }
         }
         condition_tasks.notify_one();
         return 0;
@@ -1544,6 +1552,20 @@ struct server_queue {
             }
         }
     }
+
+private:
+    void cleanup_pending_task(int id_task) {
+        // no need lock because this is called exclusively by post()
+        auto rm_func = [id_task](const server_task & task) {
+            return task.id_target == id_task;
+        };
+        queue_tasks.erase(
+            std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
+            queue_tasks.end());
+        queue_tasks_deferred.erase(
+            std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
+            queue_tasks_deferred.end());
+    }
 };
 
 struct server_response {
@@ -1579,6 +1601,12 @@ struct server_response {
 
         std::unique_lock<std::mutex> lock(mutex_results);
         waiting_task_ids.erase(id_task);
+        // make sure to clean up all pending results
+        queue_results.erase(
+            std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
+                return res->id == id_task;
+            }),
+            queue_results.end());
     }
 
     void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
@@ -1598,7 +1626,7 @@ struct server_response {
             return !queue_results.empty();
         });
 
-        for (int i = 0; i < (int) queue_results.size(); i++) {
+        for (size_t i = 0; i < queue_results.size(); i++) {
             if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
                 server_task_result_ptr res = std::move(queue_results[i]);
                 queue_results.erase(queue_results.begin() + i);
@@ -1615,12 +1643,6 @@ struct server_response {
     server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
         while (true) {
             std::unique_lock<std::mutex> lock(mutex_results);
-            bool cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{
-                return !queue_results.empty();
-            });
-            if (!cr_res) {
-                return nullptr;
-            }
 
             for (int i = 0; i < (int) queue_results.size(); i++) {
                 if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
@@ -1629,6 +1651,11 @@ struct server_response {
                     return res;
                 }
             }
+
+            std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
+            if (cr_res == std::cv_status::timeout) {
+                return nullptr;
+            }
         }
 
         // should never reach here
@@ -1772,6 +1799,9 @@ struct server_context {
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
+
+            // the context is not needed - we will create one for each slot
+            llama_init_dft.context.reset();
         }
 
         chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
@@ -2373,8 +2403,8 @@ struct server_context {
 
             server_task task(SERVER_TASK_TYPE_CANCEL);
             task.id_target = id_task;
-            cancel_tasks.push_back(task);
             queue_results.remove_waiting_task_id(id_task);
+            cancel_tasks.push_back(task);
         }
         // push to beginning of the queue, so it has highest priority
        queue_tasks.post(cancel_tasks, true);
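
The reordering in `recv_with_timeout` follows the usual check-then-wait pattern for condition variables: scan what is already queued before blocking, and treat a timed-out wait as "nothing new arrived". The removed version waited on a `!queue_results.empty()` predicate first, which could be satisfied by results belonging to other tasks. Below is a simplified standalone sketch of the new loop; names and types are reduced from the server code (results are plain task ids instead of `server_task_result_ptr`).

```cpp
// Simplified sketch of the check-then-wait loop that recv_with_timeout now uses.
#include <chrono>
#include <condition_variable>
#include <deque>
#include <mutex>
#include <optional>

struct result_queue {
    std::mutex mutex_results;
    std::condition_variable condition_results;
    std::deque<int> queue_results; // results identified by task id (simplified)

    // Returns the matching result, or std::nullopt if nothing arrived within the timeout.
    std::optional<int> recv_with_timeout(int id_task, int timeout_s) {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_results);

            // 1) drain anything already queued, so results posted before (or between)
            //    waits are never missed
            for (size_t i = 0; i < queue_results.size(); i++) {
                if (queue_results[i] == id_task) {
                    int res = queue_results[i];
                    queue_results.erase(queue_results.begin() + i);
                    return res;
                }
            }

            // 2) only then block; a timeout means no new result was signalled in time
            if (condition_results.wait_for(lock, std::chrono::seconds(timeout_s)) == std::cv_status::timeout) {
                return std::nullopt;
            }
        }
    }

    void push(int id_task) {
        std::lock_guard<std::mutex> lock(mutex_results);
        queue_results.push_back(id_task);
        condition_results.notify_all();
    }
};
```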
