
Commit 14f64da

Merge branch 'ggerganov:master' into cuda-build-doc
2 parents f70b514 + 5555c0c commit 14f64da

38 files changed: +576 / -556 lines

.github/workflows/server.yml

Lines changed: 1 addition & 1 deletion

@@ -79,7 +79,7 @@ jobs:
       # Setup nodejs (to be used for verifying bundled index.html)
       - uses: actions/setup-node@v4
         with:
-          node-version: 22
+          node-version: '22.11.0'
 
       - name: Verify bundled index.html
         id: verify_server_index_html

common/arg.cpp

Lines changed: 8 additions & 1 deletion

@@ -591,7 +591,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1711,6 +1711,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--no-webui"},
+        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.webui = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
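The new option gives the server an API-only mode: for example `llama-server -m model.gguf --no-webui`, or equivalently `LLAMA_ARG_NO_WEBUI=1` in the environment (the model path here is only illustrative).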

examples/quantize/README.md

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d
 - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
 - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
 - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
-- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
+- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
 - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
 - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
 - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)

examples/server/README.md

Lines changed: 10 additions & 9 deletions

@@ -146,6 +146,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
 | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
+| `--no-webui` | disable the Web UI<br/>(env: LLAMA_ARG_NO_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@@ -302,23 +303,23 @@ mkdir llama-client
 cd llama-client
 ```
 
-Create a index.js file and put this inside:
+Create an index.js file and put this inside:
 
 ```javascript
-const prompt = `Building a website can be done in 10 simple steps:`;
+const prompt = "Building a website can be done in 10 simple steps:"
 
-async function Test() {
+async function test() {
     let response = await fetch("http://127.0.0.1:8080/completion", {
-        method: 'POST',
+        method: "POST",
         body: JSON.stringify({
             prompt,
-            n_predict: 512,
+            n_predict: 64,
         })
     })
    console.log((await response.json()).content)
 }
 
-Test()
+test()
 ```
 
 And run it:
@@ -380,7 +381,7 @@ Multiple prompts are also supported. In this case, the completion result will be
 `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
 By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
 
-`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
+`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
 
 `stop`: Specify a JSON array of stopping strings.
 These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
@@ -441,11 +442,11 @@ These words will not be included in the completion, so make sure to add them to
 
 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
-`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
+`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
 
 **Response format**
 
-- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
+- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
 
 - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
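A minimal client-side sketch of the streaming note above (not part of this commit): assuming the server emits standard `data: <json>` SSE lines and leaving out re-assembly of lines split across read chunks, plain `fetch` in Node 18+ can stand in for the `EventSource` interface:

```javascript
// Sketch only: stream /completion with fetch, since EventSource
// cannot send POST requests. Assumes "data: <json>" SSE lines and
// skips buffering of lines split across read() chunks.
async function streamCompletion() {
    const response = await fetch("http://127.0.0.1:8080/completion", {
        method: "POST",
        body: JSON.stringify({
            prompt: "Building a website can be done in 10 simple steps:",
            n_predict: 64,
            stream: true,
        })
    })
    const reader = response.body.getReader()
    const decoder = new TextDecoder()
    while (true) {
        const { done, value } = await reader.read()
        if (done) break
        for (const line of decoder.decode(value, { stream: true }).split("\n")) {
            if (!line.startsWith("data: ")) continue
            const data = JSON.parse(line.slice("data: ".length))
            process.stdout.write(data.content) // print each token as it arrives
            if (data.stop) return
        }
    }
}

streamCompletion()
```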

examples/server/public/index.html

Lines changed: 118 additions & 80 deletions
Large diffs are not rendered by default.

examples/server/server.cpp

Lines changed: 17 additions & 13 deletions

@@ -3815,20 +3815,24 @@ int main(int argc, char ** argv) {
     // Router
     //
 
-    // register static assets routes
-    if (!params.public_path.empty()) {
-        // Set the base directory for serving static files
-        bool is_found = svr->set_mount_point("/", params.public_path);
-        if (!is_found) {
-            LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
-            return 1;
-        }
+    if (!params.webui) {
+        LOG_INF("Web UI is disabled\n");
     } else {
-        // using embedded static index.html
-        svr->Get("/", [](const httplib::Request &, httplib::Response & res) {
-            res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
-            return false;
-        });
+        // register static assets routes
+        if (!params.public_path.empty()) {
+            // Set the base directory for serving static files
+            bool is_found = svr->set_mount_point("/", params.public_path);
+            if (!is_found) {
+                LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+                return 1;
+            }
+        } else {
+            // using embedded static index.html
+            svr->Get("/", [](const httplib::Request &, httplib::Response & res) {
+                res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
+                return false;
+            });
+        }
     }
 
     // register API routes
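Note that the `params.webui` guard wraps only the static-asset routing; the API routes registered just below are unaffected, so with `--no-webui` a GET on `/` returns 404 (exactly what the new test asserts) while endpoints such as `/completion` keep working.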

examples/server/tests/unit/test_basic.py

Lines changed: 18 additions & 0 deletions

@@ -1,4 +1,5 @@
 import pytest
+import requests
 from utils import *
 
 server = ServerPreset.tinyllama2()
@@ -76,3 +77,20 @@ def test_load_split_model():
     })
     assert res.status_code == 200
     assert match_regex("(little|girl)+", res.body["content"])
+
+
+def test_no_webui():
+    global server
+    # default: webui enabled
+    server.start()
+    url = f"http://{server.server_host}:{server.server_port}"
+    res = requests.get(url)
+    assert res.status_code == 200
+    assert "<html>" in res.text
+    server.stop()
+
+    # with --no-webui
+    server.no_webui = True
+    server.start()
+    res = requests.get(url)
+    assert res.status_code == 404

examples/server/tests/utils.py

Lines changed: 3 additions & 0 deletions

@@ -72,6 +72,7 @@ class ServerProcess:
     disable_ctx_shift: int | None = False
     draft_min: int | None = None
     draft_max: int | None = None
+    no_webui: bool | None = None
 
     # session variables
     process: subprocess.Popen | None = None
@@ -158,6 +159,8 @@ def start(self, timeout_seconds: int = 10) -> None:
             server_args.extend(["--draft-max", self.draft_max])
         if self.draft_min:
             server_args.extend(["--draft-min", self.draft_min])
+        if self.no_webui:
+            server_args.append("--no-webui")
 
         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"bench: starting server with: {' '.join(args)}")

examples/server/utils.hpp

Lines changed: 1 addition & 1 deletion

@@ -333,7 +333,7 @@ static std::string llama_get_chat_template(const struct llama_model * model) {
     if (res < 2) {
         return "";
     } else {
-        std::vector<char> model_template(res, 0);
+        std::vector<char> model_template(res + 1, 0);
         llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
         return std::string(model_template.data(), model_template.size() - 1);
     }
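The `res + 1` makes room for the terminating NUL: `llama_model_meta_val_str` follows the usual snprintf-style contract of returning the string length while writing at most `buf_size` bytes including the terminator, so a buffer of exactly `res` bytes silently dropped the last character of the chat template. The `model_template.size() - 1` on the next line then strips the NUL from the returned `std::string`.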

examples/server/webui/index.html

Lines changed: 81 additions & 43 deletions

@@ -15,7 +15,7 @@
     <!-- sidebar -->
     <div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
       <label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
-      <div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
+      <div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
         <div class="flex flex-row items-center justify-between mb-4 mt-4">
           <h2 class="font-bold ml-4">Conversations</h2>
@@ -120,51 +120,25 @@ <h2 class="font-bold ml-4">Conversations</h2>
             {{ messages.length === 0 ? 'Send a message to start' : '' }}
           </div>
           <div v-for="msg in messages" class="group">
-            <div :class="{
-              'chat': true,
-              'chat-start': msg.role !== 'user',
-              'chat-end': msg.role === 'user',
-            }">
-              <div :class="{
-                'chat-bubble markdown': true,
-                'chat-bubble-base-300': msg.role !== 'user',
-              }">
-                <!-- textarea for editing message -->
-                <template v-if="editingMsg && editingMsg.id === msg.id">
-                  <textarea
-                    class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
-                    v-model="msg.content"></textarea>
-                  <br/>
-                  <button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
-                  <button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
-                </template>
-                <!-- render message as markdown -->
-                <vue-markdown v-else :source="msg.content" />
-              </div>
-            </div>
-
-            <!-- actions for each message -->
-            <div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
-              <!-- user message -->
-              <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
-                ✍️ Edit
-              </button>
-              <!-- assistant message -->
-              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
-                🔄 Regenerate
-              </button>
-              <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
-                📋 Copy
-              </button>
-            </div>
+            <message-bubble
+              :config="config"
+              :msg="msg"
+              :key="msg.id"
+              :is-generating="isGenerating"
+              :edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
+              :regenerate-msg="regenerateMsg"></message-bubble>
           </div>
 
           <!-- pending (ongoing) assistant message -->
-          <div id="pending-msg" class="chat chat-start">
-            <div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
-              <span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
-              <vue-markdown v-else :source="pendingMsg.content" />
-            </div>
+          <div id="pending-msg" class="group">
+            <message-bubble
+              v-if="pendingMsg"
+              :config="config"
+              :msg="pendingMsg"
+              :key="pendingMsg.id"
+              :is-generating="isGenerating"
+              :edit-user-msg-and-regenerate="() => {}"
+              :regenerate-msg="() => {}"></message-bubble>
           </div>
         </div>
@@ -227,6 +201,10 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
         <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
           <summary class="collapse-title font-bold">Advanced config</summary>
           <div class="collapse-content">
+            <div class="flex flex-row items-center mb-2">
+              <input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
+              <span class="ml-4">Show tokens per second</span>
+            </div>
             <label class="form-control mb-2">
               <!-- Custom parameters input -->
               <div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
@@ -247,6 +225,66 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
 
     </div>
 
+
+    <!-- Template to be used as message bubble -->
+    <template id="message-bubble">
+      <div :class="{
+        'chat': true,
+        'chat-start': msg.role !== 'user',
+        'chat-end': msg.role === 'user',
+      }">
+        <div :class="{
+          'chat-bubble markdown': true,
+          'chat-bubble-base-300': msg.role !== 'user',
+        }">
+          <!-- textarea for editing message -->
+          <template v-if="editingContent !== null">
+            <textarea
+              class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
+              v-model="editingContent"></textarea>
+            <br/>
+            <button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
+            <button class="btn mt-2" @click="editMsg()">Submit</button>
+          </template>
+          <template v-else>
+            <!-- show loading dots for pending message -->
+            <span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
+            <!-- render message as markdown -->
+            <vue-markdown v-else :source="msg.content"></vue-markdown>
+            <!-- render timings if enabled -->
+            <div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
+              <div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
+              <div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
+                <b>Prompt</b><br/>
+                - Tokens: {{ timings.prompt_n }}<br/>
+                - Time: {{ timings.prompt_ms }} ms<br/>
+                - Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
+                <b>Generation</b><br/>
+                - Tokens: {{ timings.predicted_n }}<br/>
+                - Time: {{ timings.predicted_ms }} ms<br/>
+                - Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
+              </div>
+            </div>
+          </template>
+        </div>
+      </div>
+      <!-- actions for each message -->
+      <div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
+        <!-- user message -->
+        <button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
+          ✍️ Edit
+        </button>
+        <!-- assistant message -->
+        <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
+          🔄 Regenerate
+        </button>
+        <button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
+          📋 Copy
+        </button>
+      </div>
+    </template>
+
+
     <!-- Template to be used by settings modal -->
     <template id="settings-modal-short-input">
       <label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
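For orientation, a hedged sketch of how the extracted `message-bubble` template might be registered as a Vue 3 component. The props mirror the bindings in the markup above, but the local state and method bodies (`editingContent`, `timings`, `editMsg`, `copyMsg`) are assumptions inferred from the identifiers the template references, not the commit's actual code (which lives in the unrendered `public/index.html` diff):

```javascript
// Hypothetical sketch, NOT the commit's implementation: wiring the
// "#message-bubble" <template> up as a component whose props match the
// :config / :msg / :is-generating / :edit-user-msg-and-regenerate /
// :regenerate-msg bindings used in the markup above.
import { defineComponent } from 'vue';

const MessageBubble = defineComponent({
  template: '#message-bubble',
  props: ['config', 'msg', 'isGenerating', 'editUserMsgAndRegenerate', 'regenerateMsg'],
  data() {
    return { editingContent: null }; // non-null while this bubble is being edited
  },
  computed: {
    // assumed source of the tokens-per-second stats shown in the dropdown
    timings() {
      return this.msg.timings ?? null;
    },
  },
  methods: {
    editMsg() {
      // hand the edited content back to the parent and leave edit mode
      this.editUserMsgAndRegenerate({ ...this.msg, content: this.editingContent });
      this.editingContent = null;
    },
    copyMsg() {
      navigator.clipboard.writeText(this.msg.content);
    },
  },
});
```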
