Commit 8c8f846

api: support llama.cpp webui
1 parent 546fa4d

File tree

3 files changed: +106 -11 lines changed


docs/binding.md

Lines changed: 16 additions & 2 deletions

@@ -56,9 +56,12 @@ streamlit run chatllm_st.py -- -i -m path/to/model
 
 Note: "STOP" function is not implemented yet.
 
-### OpenAI/Ollama Compatible API
+### OpenAI/Ollama/llama.cpp Compatible API
 
-[Here](../scripts/openai_api.py) is a server providing some OpenAI/Ollama Compatible API. Note that most of
+> [!IMPORTANT]
+> This is going to be re-written in Nim. Only basic functionalities are provided.
+
+[Here](../scripts/openai_api.py) is a server (default port: 11434) providing some OpenAI/Ollama/llama.cpp Compatible API. Note that most of
 the parameters are ignored.
 
 `openai_api.py` supports loading several types of models for chatting, code completion (FIM), text embedding, etc.
@@ -84,6 +87,8 @@ Some base models that can be used for code completion:
 * CodeGemma v1.1: [Base-2B](https://huggingface.co/google/codegemma-1.1-2b), [Base-7B](https://huggingface.co/google/codegemma-1.1-7b)
 * StarCoder2: [Base-3B](https://huggingface.co/bigcode/starcoder2-7b), [Base-7B](https://huggingface.co/bigcode/starcoder2-7b), [Base-15B](https://huggingface.co/bigcode/starcoder2-15b) (not recommended)
 
+#### Ollama
+
 This module provides enough of the Ollama API that it can be used to emulate an Ollama model provider in Visual Studio Code Copilot.
 For example, starting the server with a model:
 
@@ -95,6 +100,15 @@ Select the model from Ollama provider:
 
 ![](vscode_ollama.png)
 
+#### WebUI
+
+`openai_api.py` provides a WebUI. The WebUI of [llama.cpp](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) is also (partly) supported.
+Use `--ui` to select the llama.cpp WebUI:
+
+```sh
+python openai_api.py --ui /path/to/index.html.gz ---chat :qwen2.5
+```
+
 ## JavaScript/TypeScript
 
 ### Command line
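
Once the server is running, any OpenAI-style client should be able to talk to it. The sketch below is a minimal, hypothetical check using only the standard library; the endpoint path (`/v1/chat/completions`), the model name, and the response shape follow the usual OpenAI convention and are assumptions rather than details confirmed by this commit (recall that most request parameters are ignored anyway).

```python
# Hypothetical client-side check against the OpenAI-compatible server.
# Endpoint path, model name, and response shape are assumed, not taken from this commit.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:11434/v1/chat/completions",   # default port used by openai_api.py
    data=json.dumps({
        "model": "chat",                             # illustrative name for the loaded chat model
        "messages": [{"role": "user", "content": "Hello!"}],
    }).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as rsp:
    reply = json.loads(rsp.read())
    print(reply["choices"][0]["message"]["content"])  # assumes OpenAI-style response body
```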

scripts/openai_api.py

Lines changed: 90 additions & 9 deletions

@@ -182,6 +182,7 @@ def make_id(self) -> str:
 http_server: HTTPServer = None
 
 model_info = {}
+ui_file_name = 'chat_ui.html'
 
 def get_streamer(model: str) -> ChatLLMStreamer | None:
     if model.endswith('fim') or model.startswith('fim'):
@@ -375,14 +376,71 @@ def handle_TAGS(self, obj: dict):
375376
self.wfile.flush()
376377

377378
def handle_UI(self, obj: dict):
379+
fn = ui_file_name
380+
if not os.path.isfile(fn):
381+
fn = 'scripts/' + fn
382+
assert os.path.isfile(fn)
383+
378384
self.send_response(200)
385+
if fn.endswith('.gz'):
386+
self.send_header("Content-Encoding", "gzip")
379387
self.send_header('Content-type', 'text/html; charset=utf-8')
380388
self.end_headers()
381-
fn = 'chat_ui.html'
382-
if not os.path.isfile('chat_ui.html'):
383-
fn = 'scripts/' + fn
384-
with open(fn, 'r', encoding='utf-8') as f:
385-
self.wfile.write(f.read().encode())
389+
390+
with open(fn, 'rb') as f:
391+
self.wfile.write(f.read())
392+
self.wfile.flush()
393+
394+
def handle_llama_props(self, obj: dict):
395+
global model_info
396+
capabilities = model_info['chat']['capabilities']
397+
modalities = {
398+
"vision": "Image Input" in capabilities
399+
}
400+
rsp = {
401+
"default_generation_settings": "",
402+
"total_slots": 1,
403+
"model_alias": model_info['chat']['name'],
404+
"model_path": "",
405+
"modalities": modalities,
406+
"endpoint_slots": 0,
407+
"endpoint_props": {},
408+
"endpoint_metrics": 0,
409+
"webui": 0,
410+
"chat_template": "",
411+
"bos_token": [],
412+
"eos_token": [],
413+
"build_info": "Today",
414+
}
415+
self.send_response(200)
416+
self.send_header('Content-type', 'application/json')
417+
self.end_headers()
418+
self.wfile.write(json.dumps(rsp, indent=True).encode('utf-8'))
419+
self.wfile.flush()
420+
421+
def handle_llama_slots(self, obj: dict):
422+
global model_info
423+
rsp = [
424+
{
425+
"id": 0,
426+
"id_task": 1,
427+
"n_ctx": model_info['chat']['context_length'],
428+
"speculative": False,
429+
"is_processing": False,
430+
"params": {
431+
"n_predict": -1,
432+
"seed": 1,
433+
"temperature": 0.8,
434+
"dynatemp_range": 0.0,
435+
"dynatemp_exponent": 1.0,
436+
"top_k": 40,
437+
}
438+
}
439+
]
440+
self.send_response(200)
441+
self.send_header('Content-type', 'application/json')
442+
self.end_headers()
443+
self.wfile.write(json.dumps(rsp, indent=True).encode('utf-8'))
386444
self.wfile.flush()
387445

388446
def do_GET(self):
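
The two new handlers above mirror llama.cpp's `/props` and `/slots` endpoints just closely enough for its WebUI to render. A quick way to eyeball the emulated responses, assuming the server is already running on the default port 11434:

```python
# Minimal smoke test for the llama.cpp-style endpoints added above.
# Assumes openai_api.py is already serving a chat model on localhost:11434.
import json
import urllib.request

for path in ("/props", "/slots"):
    with urllib.request.urlopen(f"http://localhost:11434{path}") as rsp:
        body = json.loads(rsp.read())
        # Print a short prefix of each JSON reply for inspection.
        print(path, "->", json.dumps(body, indent=2)[:200], "...")
```
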
@@ -393,9 +451,15 @@ def do_GET(self):
         elif self.path.endswith('/tags'):
             self.handle_TAGS({})
             return
-        elif self.path.endswith('/ui'):
+        elif self.path.endswith('/props'):
+            self.handle_llama_props({})
+            return
+        elif self.path.endswith('/ui') or self.path.startswith('/?') or (self.path in ['', '/']):
             self.handle_UI({})
             return
+        elif self.path.startswith('/slots'):
+            self.handle_llama_slots({})
+            return
         else:
             self.send_error(404, 'NOT FOUND')
             return
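
With this routing, a plain browser `GET /` (or `GET /?...`) now lands in `handle_UI`, which serves a pre-compressed `index.html.gz` verbatim with `Content-Encoding: gzip`. Browsers decode this automatically; a sketch of what a non-browser client has to do, again assuming the default port:

```python
# Fetch the WebUI root and transparently handle the pre-gzipped case.
# Assumes the server runs on localhost:11434.
import gzip
import urllib.request

with urllib.request.urlopen("http://localhost:11434/") as rsp:
    raw = rsp.read()
    if rsp.headers.get("Content-Encoding") == "gzip":
        raw = gzip.decompress(raw)   # handle_UI writes .gz files unmodified
    print(raw[:80])                  # start of the HTML document
```
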
@@ -419,13 +483,29 @@ def do_OPTIONS(self):
 signal.signal(signal.SIGINT, handler)
 
 ARG_SEP = '---'
+port = 11434
 
 args = sys.argv[1:]
 if len(args) < 1:
-    print(f"usage: python openai_api.py [{ARG_SEP}TYPE path/to/model [additional args]]")
+    print(f"usage: python openai_api.py [app_args] [{ARG_SEP}TYPE path/to/model [additional args]]")
+    print(f"where app_args :: --ui /path/to/ui --port PORT")
     print('where TYPE ::= chat | fim | emb')
     exit(-1)
 
+while len(args) > 0:
+    if args[0] == '--ui':
+        args.pop(0)
+        assert len(args) > 0
+        ui_file_name = args[0]
+        args.pop(0)
+    if args[0] == '--port':
+        args.pop(0)
+        assert len(args) > 0
+        port = int(args[0])
+        args.pop(0)
+    else:
+        break
+
 chat_args = ['-m']
 fim_args = ['-m']
 emb_args = ['-m']
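
The new `--ui`/`--port` options are consumed from the front of the argument list before the existing `---TYPE` sections are parsed; anything unrecognized stops the loop. A small standalone illustration of that consumption logic (the function name and the condensed `elif` form are illustrative, not the script's literal code):

```python
# Standalone sketch of the leading-flag parsing used by openai_api.py:
# flags are popped off the front until something that is not --ui/--port appears.
def parse_app_args(args: list[str]) -> tuple[str, int, list[str]]:
    ui_file_name, port = 'chat_ui.html', 11434       # defaults from the script
    while len(args) > 0:
        if args[0] == '--ui':
            args.pop(0)
            ui_file_name = args.pop(0)
        elif args[0] == '--port':
            args.pop(0)
            port = int(args.pop(0))
        else:
            break                                     # remaining args: ---chat/---fim/---emb sections
    return ui_file_name, port, args

print(parse_app_args(['--ui', 'index.html.gz', '--port', '8080', '---chat', ':qwen2.5']))
# ('index.html.gz', 8080, ['---chat', ':qwen2.5'])
```
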
@@ -461,6 +541,7 @@ def do_OPTIONS(self):
 
 print(model_info)
 
-print("LLM Loaded. Starting server...")
-http_server = HTTPServer(('0.0.0.0', 11434), HttpHandler)
+print(f"LLM Loaded. Starting server on port {port}...")
+print(f"http://localhost:{port}")
+http_server = HTTPServer(('0.0.0.0', port), HttpHandler)
 http_server.serve_forever()
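
Since the port is now configurable and the server prints the URL it binds to, a small convenience on the client side is to wait for that port to accept connections before opening the WebUI. This helper is purely illustrative and not part of `openai_api.py`:

```python
# Optional convenience: wait until the freshly started server accepts connections,
# then open the WebUI it announces on http://localhost:{port}.
# The helper name and polling interval are illustrative assumptions.
import socket
import time
import webbrowser

def open_webui_when_ready(port: int = 11434, timeout: float = 30.0) -> None:
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection(("localhost", port), timeout=1):
                webbrowser.open(f"http://localhost:{port}")
                return
        except OSError:
            time.sleep(0.5)          # server may still be loading the model
    raise TimeoutError(f"server on port {port} did not come up")

open_webui_when_ready(11434)
```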
