ggml-org · Roshankumarb31 · Oct 12, 2025 · Oct 12, 2025 · Oct 12, 2025
@@ -1032,6 +1032,50 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
 
 </details>
 
+
+### GET `/slots/status`: Secure slot monitoring without sensitive data
+
+This endpoint provides slot monitoring information suitable for production environments, load balancers, and monitoring dashboards. Unlike `/slots`, it does not expose sensitive user data such as prompts or generated text.
+
+**Response format:**
+
+```json
+{
+  "slots": [
+    {
+      "id": 0,
+      "state": "idle",
+      "is_processing": false
+    },
+    {
+      "id": 1,
+      "state": "processing",
+      "is_processing": true,
+      "n_ctx": 2048,
+      "n_past": 128,
+      "n_decoded": 45,
+      "n_remaining": 155,
+      "truncated": false
+    }
+    ],
+  "total_slots": 4,
+  "idle_slots": 3,
+  "processing_slots": 1,
+  "queue_size": 2,
+  "server_uptime_ms": 45231.5
+}
+```
+
+
+**Use cases:**
+- Monitoring dashboards that need capacity information
+- Load balancers (e.g., Paddler) for request routing
+- Capacity planning and resource allocation
+- Production deployments where prompt privacy is required
+
+**Security:** This endpoint excludes all sensitive data including prompts, generated text, tokens, and detailed task parameters.
+
+
 ### GET `/metrics`: Prometheus compatible metrics exporter
 
 This endpoint is only accessible if `--metrics` is set.

@@ -4624,6 +4624,68 @@ int main(int argc, char ** argv) {
         res_ok(res, res_task->slots_data);
     };
 
+    // Secure slots monitoring endpoint - exposes only non-sensitive slot data
+    const auto handle_slots_status = [&](const httplib::Request &req, httplib::Response &res) {
+        if (!params.endpoint_slots) {
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with --slots", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        int task_id = ctx_server.queue_tasks.get_new_id();
+        server_task task(SERVER_TASK_TYPE_METRICS);
+        task.id = task_id;
+
+        ctx_server.queue_results.add_waiting_task_id(task_id);
+        ctx_server.queue_tasks.post(std::move(task), true);
+
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
+
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
+
+        auto res_task = dynamic_cast<server_task_result_metrics *>(result.get());
+        GGML_ASSERT(res_task != nullptr);
+
+        json slots_status = json::array();  // build response with only safe data
+        int n_idle = 0;
+        int n_processing = 0;
+
+        for (const auto &slot_data : res_task->slots_data) {
+            json secure_slot;
+
+            secure_slot["id"] = slot_data["id"];
+            secure_slot["state"] = slot_data["state"];
+            secure_slot["is_processing"] = slot_data["is_processing"];
+
+            if (slot_data["is_processing"]) {
+                secure_slot["n_ctx"] = slot_data["n_ctx"];
+                secure_slot["n_past"] = slot_data["n_past"];
+                secure_slot["n_decoded"] = slot_data["n_decoded"];
+                secure_slot["n_remaining"] = slot_data["n_remaining"];
+                secure_slot["truncated"] = slot_data["truncated"];
+                n_processing++;
+            } else {
+                n_idle++;
+            }
+
+            slots_status.push_back(secure_slot);
+        }
+
+        json response;
+        response["slots"] = slots_status;
+        response["total_slots"] = res_task->slots_data.size();
+        response["idle_slots"] = n_idle;
+        response["processing_slots"] = n_processing;
+        response["queue_size"] = res_task->n_tasks_deferred;
+        response["server_uptime_ms"] = (ggml_time_us() - res_task->t_start) / 1000.0;
+
+        res_ok(res, response);
+    };
+
+
     const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
         if (!params.endpoint_metrics) {
             res_error(res, format_error_response("This server does not support metrics endpoint. Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED));
@@ -5596,6 +5658,7 @@ int main(int argc, char ** argv) {
     svr->Post(params.api_prefix + "/lora-adapters",       handle_lora_adapters_apply);
     // Save & load slots
     svr->Get (params.api_prefix + "/slots",               handle_slots);
+    svr->Get(params.api_prefix + "/slots/status",         handle_slots_status);
     svr->Post(params.api_prefix + "/slots/:id_slot",      handle_slots_action);
 
     //

diff --git a/tools/server/tests/test_slots_status.py b/tools/server/tests/test_slots_status.py
@@ -0,0 +1,34 @@
+import pytest
+from utils import *
+
+# Test the new secure slots monitoring endpoint
+def test_slots_status_basic():
+    global server
+    res = server.make_request("GET", "/slots/status")
+    assert res.status_code == 200
+
+def test_slots_status_structure():
+    global server
+    res = server.make_request("GET", "/slots/status")
+    data = res.body
+
+    # Check response structure
+    assert "slots" in data
+    assert "total_slots" in data
+    assert "idle_slots" in data
+    assert "processing_slots" in data
+    assert isinstance(data["slots"], list)
+
+def test_slots_status_no_sensitive_data():
+    """Critical security test: ensure no sensitive data leakage"""
+    global server
+    res = server.make_request("GET", "/slots/status")
+    data = res.body
+
+    for slot in data["slots"]:
+        # These fields must NEVER appear in the response
+        assert "prompt" not in slot
+        assert "generated_text" not in slot
+        assert "generated" not in slot
+        assert "tokens" not in slot
+        assert "cache_tokens" not in slot