From 517c41ba1a58aeece98bb64e92b163f841b088f3 Mon Sep 17 00:00:00 2001
From: roshankumarb31
Date: Sun, 12 Oct 2025 17:46:03 +0530
Subject: [PATCH 1/3] server: add /slots/status endpoint for monitoring

Adds a new endpoint that returns slot status without exposing sensitive
data like prompts or generated text. Useful for load balancers and
monitoring tools that need to check slot availability without accessing
user data.

Refs #11040
---
Review note: the added lines of the first hunk were revised during review
(same number of lines); the blob ids on the `index` line predate that revision.

 tools/server/server.cpp | 63 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index cf12805b4998a..1fdd1889e6661 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -4624,6 +4624,68 @@ int main(int argc, char ** argv) {
         res_ok(res, res_task->slots_data);
     };
 
+    // Slot monitoring without user data: only allow-listed per-slot keys and
+    // aggregate counters are copied; nothing else from the slot data is forwarded.
+    const auto handle_slots_status = [&](const httplib::Request &, httplib::Response & res) {
+        if (!params.endpoint_slots) {
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with --slots", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        // post a SERVER_TASK_TYPE_METRICS task and wait for its result
+        const int task_id = ctx_server.queue_tasks.get_new_id();
+        server_task task(SERVER_TASK_TYPE_METRICS);
+        task.id = task_id;
+        ctx_server.queue_results.add_waiting_task_id(task_id);
+        ctx_server.queue_tasks.post(std::move(task), true);
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
+
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
+
+        // NOTE(review): cast target inferred from the members read below - confirm
+        auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
+        GGML_ASSERT(res_task != nullptr);
+
+        // allow-list of keys; anything not named here never reaches the response
+        static const char * const keys_always[] = { "id", "state", "is_processing" };
+        static const char * const keys_active[] = { "n_ctx", "n_past", "n_decoded", "n_remaining", "truncated" };
+        // copy only keys that exist: operator[] on a const json must not be given a missing key
+        const auto copy_keys = [](const json & src, json & dst, const auto & keys) {
+            for (const char * key : keys) {
+                if (src.contains(key)) {
+                    dst[key] = src.at(key);
+                }
+            }
+        };
+
+        json slots_status = json::array();
+        int n_processing = 0;
+
+        for (const auto & slot_data : res_task->slots_data) {
+            json secure_slot = json::object();
+            copy_keys(slot_data, secure_slot, keys_always);
+            // explicit bool read; a slot without the flag is reported as idle
+            if (slot_data.value("is_processing", false)) {
+                copy_keys(slot_data, secure_slot, keys_active);
+                n_processing++;
+            }
+            slots_status.push_back(std::move(secure_slot));
+        }
+
+        json response;
+        response["slots"]            = std::move(slots_status);
+        response["total_slots"]      = res_task->slots_data.size();
+        response["idle_slots"]       = (int) res_task->slots_data.size() - n_processing;
+        response["processing_slots"] = n_processing;
+        response["queue_size"]       = res_task->n_tasks_deferred;
+        response["server_uptime_ms"] = (ggml_time_us() - res_task->t_start) / 1000.0;
+        res_ok(res, response);
+    };
+
     const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
         if (!params.endpoint_metrics) {
             res_error(res, format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); @@ -5596,6 +5658,7 @@ int main(int argc, char ** argv) { svr->Post(params.api_prefix + "/lora-adapters", handle_lora_adapters_apply); // Save & load slots svr->Get (params.api_prefix + "/slots", handle_slots); + svr->Get(params.api_prefix + "/slots/status", handle_slots_status); svr->Post(params.api_prefix + "/slots/:id_slot", handle_slots_action); // From f9b39506d6ee640431bed943cca9352a5dc3d6e9 Mon Sep 17 00:00:00 2001 From: roshankumarb31 Date: Sun, 12 Oct 2025 19:01:10 +0530 Subject: [PATCH 2/3] docs: add documentation for /slots/status endpoint --- tools/server/README.md | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tools/server/README.md b/tools/server/README.md index f5ab9236d5216..ebcdd943cff31 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1032,6 +1032,50 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat + +### GET `/slots/status`: Secure slot monitoring without sensitive data + +This endpoint provides slot monitoring information suitable for production environments, load balancers, and monitoring dashboards. Unlike `/slots`, it does not expose sensitive user data such as prompts or generated text. It is only available when the server is started with `--slots`; otherwise the request is rejected with a "not supported" error. 
+
+**Response format:**
+
+```json
+{
+  "slots": [
+    {
+      "id": 0,
+      "state": "idle",
+      "is_processing": false
+    },
+    {
+      "id": 1,
+      "state": "processing",
+      "is_processing": true,
+      "n_ctx": 2048,
+      "n_past": 128,
+      "n_decoded": 45,
+      "n_remaining": 155,
+      "truncated": false
+    }
+  ],
+  "total_slots": 4,
+  "idle_slots": 3,
+  "processing_slots": 1,
+  "queue_size": 2,
+  "server_uptime_ms": 45231.5
+}
+```
+
+
+**Use cases:**
+- Monitoring dashboards that need capacity information
+- Load balancers (e.g., Paddler) for request routing
+- Capacity planning and resource allocation
+- Production deployments where prompt privacy is required
+
+**Security:** This endpoint excludes all sensitive data including prompts, generated text, tokens, and detailed task parameters.
+
+
 ### GET `/metrics`: Prometheus compatible metrics exporter
 
 This endpoint is only accessible if `--metrics` is set.

From 6090839da38b31a3a4bda9ff868bd60e2252db1e Mon Sep 17 00:00:00 2001
From: roshankumarb31
Date: Sun, 12 Oct 2025 20:38:23 +0530
Subject: [PATCH 3/3] test: add security tests for /slots/status endpoint

---
Review note: the test module was revised during review (server preset,
autouse fixture, explicit server start); line counts below are updated, the
blob ids on the `index` line predate that revision.

 tools/server/tests/test_slots_status.py | 53 +++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 tools/server/tests/test_slots_status.py

diff --git a/tools/server/tests/test_slots_status.py b/tools/server/tests/test_slots_status.py
new file mode 100644
index 0000000000000..27ed8c09fded3
--- /dev/null
+++ b/tools/server/tests/test_slots_status.py
@@ -0,0 +1,53 @@
+import pytest
+from utils import *
+
+# NOTE(review): preset and `server_slots` flag follow the sibling test modules - confirm against utils.py
+server = ServerPreset.tinyllama2()
+
+# Keys that carry user content and must never show up in a /slots/status entry
+SENSITIVE_KEYS = ("prompt", "generated_text", "generated", "tokens", "cache_tokens", "params")
+
+
+@pytest.fixture(autouse=True)
+def create_server():
+    """Recreate the server preset before each test with the slots endpoints enabled."""
+    global server
+    server = ServerPreset.tinyllama2()
+    server.server_slots = True
+
+
+def test_slots_status_basic():
+    """The endpoint answers with HTTP 200 once the server is running."""
+    global server
+    server.start()
+    res = server.make_request("GET", "/slots/status")
+    assert res.status_code == 200
+
+
+def test_slots_status_structure():
+    """The response carries the slot list plus self-consistent aggregate counters."""
+    global server
+    server.start()
+    res = server.make_request("GET", "/slots/status")
+    assert res.status_code == 200
+    data = res.body
+
+    for key in ("slots", "total_slots", "idle_slots", "processing_slots", "queue_size", "server_uptime_ms"):
+        assert key in data
+    assert isinstance(data["slots"], list)
+    assert data["total_slots"] == len(data["slots"])
+    assert data["idle_slots"] + data["processing_slots"] == data["total_slots"]
+
+
+def test_slots_status_no_sensitive_data():
+    """Critical security test: ensure no sensitive data leakage"""
+    global server
+    server.start()
+    res = server.make_request("GET", "/slots/status")
+    assert res.status_code == 200
+    data = res.body
+
+    for slot in data["slots"]:
+        for key in SENSITIVE_KEYS:
+            # these fields must NEVER appear in the response
+            assert key not in slot