diff --git a/tools/server/README.md b/tools/server/README.md
index f5ab9236d5216..ebcdd943cff31 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1032,6 +1032,52 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
+
+### GET `/slots/status`: Secure slot monitoring without sensitive data
+
+This endpoint provides slot monitoring information suitable for production environments, load balancers, and monitoring dashboards. Unlike `/slots`, it does not expose sensitive user data such as prompts or generated text.
+
+Like `/slots`, this endpoint is only available when the server is started with `--slots`; otherwise it responds with a "not supported" error.
+
+**Response format:**
+
+```json
+{
+  "slots": [
+    {
+      "id": 0,
+      "state": "idle",
+      "is_processing": false
+    },
+    {
+      "id": 1,
+      "state": "processing",
+      "is_processing": true,
+      "n_ctx": 2048,
+      "n_past": 128,
+      "n_decoded": 45,
+      "n_remaining": 155,
+      "truncated": false
+    }
+  ],
+  "total_slots": 2,
+  "idle_slots": 1,
+  "processing_slots": 1,
+  "queue_size": 2,
+  "server_uptime_ms": 45231.5
+}
+```
+
+The `n_ctx`, `n_past`, `n_decoded`, `n_remaining` and `truncated` fields are only included for slots that are currently processing a request.
+
+**Use cases:**
+- Monitoring dashboards that need capacity information
+- Load balancers (e.g., Paddler) for request routing
+- Capacity planning and resource allocation
+- Production deployments where prompt privacy is required
+
+**Security:** This endpoint excludes all sensitive data including prompts, generated text, tokens, and detailed task parameters.
+
 ### GET `/metrics`: Prometheus compatible metrics exporter
 
 This endpoint is only accessible if `--metrics` is set.
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index cf12805b4998a..1fdd1889e6661 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -4624,6 +4624,86 @@ int main(int argc, char ** argv) {
         res_ok(res, res_task->slots_data);
     };
 
+    // GET /slots/status - slot monitoring without sensitive data.
+    // Posts a SERVER_TASK_TYPE_METRICS task and copies only an allow-list of per-slot
+    // fields into the response; everything else in the slot JSON is left out.
+    // NOTE(review): gated by params.endpoint_slots, presumably the same switch that enables
+    // the full /slots endpoint - consider a dedicated flag for production use.
+    const auto handle_slots_status = [&](const httplib::Request &, httplib::Response & res) {
+        if (!params.endpoint_slots) {
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with --slots", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        // request a metrics snapshot and wait for its result
+        const int task_id = ctx_server.queue_tasks.get_new_id();
+        server_task task(SERVER_TASK_TYPE_METRICS);
+        task.id = task_id;
+
+        ctx_server.queue_results.add_waiting_task_id(task_id);
+        ctx_server.queue_tasks.post(std::move(task), true);
+
+        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
+
+        if (result->is_error()) {
+            res_error(res, result->to_json());
+            return;
+        }
+
+        // dynamic_cast requires an explicit target type; a METRICS task is expected to yield a metrics result
+        auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
+        GGML_ASSERT(res_task != nullptr);
+
+        // allow-list of fields reported only for slots that are processing
+        static const char * const busy_fields[] = { "n_ctx", "n_past", "n_decoded", "n_remaining", "truncated" };
+
+        json slots_status = json::array();
+        int n_idle       = 0;
+        int n_processing = 0;
+
+        for (const auto & slot_data : res_task->slots_data) {
+            // operator[] on a const json with a missing key is undefined behavior,
+            // so every lookup below goes through value()/contains()/at()
+            const bool is_processing = slot_data.value("is_processing", false);
+
+            json secure_slot;
+            if (slot_data.contains("id")) {
+                secure_slot["id"] = slot_data.at("id");
+            }
+            // fall back to a state derived from is_processing when the slot does not report one
+            if (slot_data.contains("state")) {
+                secure_slot["state"] = slot_data.at("state");
+            } else {
+                secure_slot["state"] = is_processing ? "processing" : "idle";
+            }
+            secure_slot["is_processing"] = is_processing;
+
+            if (is_processing) {
+                for (const char * key : busy_fields) {
+                    if (slot_data.contains(key)) {
+                        secure_slot[key] = slot_data.at(key);
+                    }
+                }
+                n_processing++;
+            } else {
+                n_idle++;
+            }
+
+            slots_status.push_back(std::move(secure_slot));
+        }
+
+        json response;
+        response["slots"]            = std::move(slots_status);
+        response["total_slots"]      = res_task->slots_data.size();
+        response["idle_slots"]       = n_idle;
+        response["processing_slots"] = n_processing;
+        response["queue_size"]       = res_task->n_tasks_deferred;
+        response["server_uptime_ms"] = (ggml_time_us() - res_task->t_start) / 1000.0; // microseconds -> milliseconds
+
+        res_ok(res, response);
+    };
+
     const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
         if (!params.endpoint_metrics) {
             res_error(res, format_error_response("This server does not support metrics endpoint. Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED));
@@ -5596,6 +5676,7 @@ int main(int argc, char ** argv) {
     svr->Post(params.api_prefix + "/lora-adapters", handle_lora_adapters_apply);
     // Save & load slots
     svr->Get (params.api_prefix + "/slots", handle_slots);
+    svr->Get (params.api_prefix + "/slots/status", handle_slots_status);
     svr->Post(params.api_prefix + "/slots/:id_slot", handle_slots_action);
     //
 
diff --git a/tools/server/tests/test_slots_status.py b/tools/server/tests/test_slots_status.py
new file mode 100644
index 0000000000000..27ed8c09fded3
--- /dev/null
+++ b/tools/server/tests/test_slots_status.py
@@ -0,0 +1,61 @@
+import pytest
+from utils import *
+
+# Test the new secure slots monitoring endpoint
+# NOTE(review): ServerPreset.tinyllama2(), server_slots and start() are assumed to come
+# from utils, as in the other server test modules - confirm.
+server = ServerPreset.tinyllama2()
+
+
+@pytest.fixture(autouse=True)
+def create_server():
+    """Give every test a fresh server with the slots endpoints enabled."""
+    global server
+    server = ServerPreset.tinyllama2()
+    server.server_slots = True  # the handler rejects requests unless slots are enabled
+
+
+def test_slots_status_basic():
+    """The endpoint answers with HTTP 200 when slots are enabled."""
+    global server
+    server.start()
+    res = server.make_request("GET", "/slots/status")
+    assert res.status_code == 200
+
+
+def test_slots_status_structure():
+    """The response carries the slot list and the aggregate counters."""
+    global server
+    server.start()
+    res = server.make_request("GET", "/slots/status")
+    assert res.status_code == 200
+    data = res.body
+
+    # Check response structure
+    assert "slots" in data
+    assert "total_slots" in data
+    assert "idle_slots" in data
+    assert "processing_slots" in data
+    assert isinstance(data["slots"], list)
+
+    # The counters must agree with the slot list
+    assert data["total_slots"] == len(data["slots"])
+    assert data["idle_slots"] + data["processing_slots"] == data["total_slots"]
+
+
+def test_slots_status_no_sensitive_data():
+    """Critical security test: ensure no sensitive data leakage"""
+    global server
+    server.start()
+    res = server.make_request("GET", "/slots/status")
+    assert res.status_code == 200
+    data = res.body
+
+    for slot in data["slots"]:
+        # These fields must NEVER appear in the response
+        assert "prompt" not in slot
+        assert "generated_text" not in slot
+        assert "generated" not in slot
+        assert "tokens" not in slot
+        assert "cache_tokens" not in slot
+        assert "params" not in slot