Account for both API and web browser requests while the model is loading

VJHack · VJHack · commit cb1338213611 · 2024-09-12T21:44:52.000-05:00
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -2591,11 +2591,15 @@ int main(int argc, char ** argv) {
         return false;
     };
 
-    auto middleware_server_state = [&state](const httplib::Request &, httplib::Response & res) {
+    auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
         server_state current_state = state.load();
         if (current_state == SERVER_STATE_LOADING_MODEL) {
-            res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
-            res.status = 503;
+            if(req.path == "/"){
+                res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
+                res.status = 503;
+            } else {
+                res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+            }
             return false;
         }
         return true;
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
@@ -105,8 +105,16 @@ Feature: llama.cpp server
     Given first token is removed
     Then  tokens can be detokenized
 
+  Scenario: Tokenize with pieces
+    When  tokenizing with pieces:
+    """
+    What is the capital of Germany?
+    媽
+    """
+    Then  tokens are given with pieces
+
   Scenario: Models available
     Given available models
     Then  1 models are supported
     Then  model 0 is identified by tinyllama-2
-    Then  model 0 is trained on 128 tokens context
+    Then  model 0 is trained on 128 tokens context
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
@@ -1208,6 +1208,7 @@ async def wait_for_slots_status(context,
         while True:
             async with await session.get(f'{base_url}/slots', params=params) as slots_response:
                 status_code = slots_response.status
+                print(await slots_response.text())
                 slots = await slots_response.json()
                 if context.debug:
                     print(f"slots responses {slots}\n")
@@ -1372,4 +1373,4 @@ def server_log(in_stream, out_stream):
     thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
     thread_stderr.start()
 
-    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
+    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")