Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,50 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat

</details>


### GET `/slots/status`: Secure slot monitoring without sensitive data

This endpoint provides slot monitoring information suitable for production environments, load balancers, and monitoring dashboards. Unlike `/slots`, it does not expose sensitive user data such as prompts or generated text. It is only available when the server is started with `--slots`; otherwise it responds with an error.

**Response format:**

```json
{
"slots": [
{
"id": 0,
"state": "idle",
"is_processing": false
},
{
"id": 1,
"state": "processing",
"is_processing": true,
"n_ctx": 2048,
"n_past": 128,
"n_decoded": 45,
"n_remaining": 155,
"truncated": false
}
],
"total_slots": 2,
"idle_slots": 1,
"processing_slots": 1,
"queue_size": 2,
"server_uptime_ms": 45231.5
}
```


**Use cases:**
- Monitoring dashboards that need capacity information
- Load balancers (e.g., Paddler) for request routing
- Capacity planning and resource allocation
- Production deployments where prompt privacy is required

**Security:** This endpoint excludes all sensitive data including prompts, generated text, tokens, and detailed task parameters.


### GET `/metrics`: Prometheus compatible metrics exporter

This endpoint is only accessible if `--metrics` is set.
Expand Down
63 changes: 63 additions & 0 deletions tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4624,6 +4624,68 @@ int main(int argc, char ** argv) {
res_ok(res, res_task->slots_data);
};

// Secure slots monitoring endpoint - exposes only non-sensitive slot data.
//
// Posts a SERVER_TASK_TYPE_METRICS task, waits for its result, then builds a
// response containing only a fixed whitelist of per-slot fields plus aggregate
// counters. Nothing else from the per-slot JSON is forwarded to the client.
//
// Responds with an ERROR_TYPE_NOT_SUPPORTED error when params.endpoint_slots is
// false, and forwards the task's error JSON when the task itself fails.
// The request object is not used, so its parameter is left unnamed.
const auto handle_slots_status = [&](const httplib::Request &, httplib::Response & res) {
    if (!params.endpoint_slots) {
        res_error(res, format_error_response("This server does not support slots endpoint. Start it with --slots", ERROR_TYPE_NOT_SUPPORTED));
        return;
    }

    // request slots data using the metrics task
    int task_id = ctx_server.queue_tasks.get_new_id();
    server_task task(SERVER_TASK_TYPE_METRICS);
    task.id = task_id;

    ctx_server.queue_results.add_waiting_task_id(task_id);
    ctx_server.queue_tasks.post(std::move(task), true);

    // get the result
    server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
    ctx_server.queue_results.remove_waiting_task_id(task_id);

    if (result->is_error()) {
        res_error(res, result->to_json());
        return;
    }

    auto res_task = dynamic_cast<server_task_result_metrics *>(result.get());
    GGML_ASSERT(res_task != nullptr);

    // Copy `key` from src to dst only when src actually has it.
    // operator[] on a const json with a missing key is undefined behaviour,
    // so every read of the slot JSON goes through find() instead.
    const auto copy_if_present = [](const json & src, json & dst, const char * key) {
        const auto it = src.find(key);
        if (it != src.end()) {
            dst[key] = *it;
        }
    };

    json slots_status = json::array(); // build response with only safe data
    int n_idle       = 0;
    int n_processing = 0;

    for (const auto & slot_data : res_task->slots_data) {
        // start from an explicit object so a slot never serializes as `null`
        json secure_slot = json::object();

        // a slot that does not report "is_processing" is counted as idle
        const auto it_processing = slot_data.find("is_processing");
        const bool is_processing = it_processing != slot_data.end() && it_processing->get<bool>();

        copy_if_present(slot_data, secure_slot, "id");
        copy_if_present(slot_data, secure_slot, "state");
        secure_slot["is_processing"] = is_processing;

        if (is_processing) {
            // progress counters are only reported for busy slots
            copy_if_present(slot_data, secure_slot, "n_ctx");
            copy_if_present(slot_data, secure_slot, "n_past");
            copy_if_present(slot_data, secure_slot, "n_decoded");
            copy_if_present(slot_data, secure_slot, "n_remaining");
            copy_if_present(slot_data, secure_slot, "truncated");
            n_processing++;
        } else {
            n_idle++;
        }

        slots_status.push_back(std::move(secure_slot));
    }

    json response;
    response["slots"]            = std::move(slots_status);
    response["total_slots"]      = res_task->slots_data.size();
    response["idle_slots"]       = n_idle;
    response["processing_slots"] = n_processing;
    response["queue_size"]       = res_task->n_tasks_deferred;
    // ggml_time_us() and t_start are differenced in microseconds; report milliseconds
    response["server_uptime_ms"] = (ggml_time_us() - res_task->t_start) / 1000.0;

    res_ok(res, response);
};


const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
if (!params.endpoint_metrics) {
res_error(res, format_error_response("This server does not support metrics endpoint. Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED));
Expand Down Expand Up @@ -5596,6 +5658,7 @@ int main(int argc, char ** argv) {
svr->Post(params.api_prefix + "/lora-adapters", handle_lora_adapters_apply);
// Save & load slots
svr->Get (params.api_prefix + "/slots", handle_slots);
svr->Get(params.api_prefix + "/slots/status", handle_slots_status);
svr->Post(params.api_prefix + "/slots/:id_slot", handle_slots_action);

//
Expand Down
34 changes: 34 additions & 0 deletions tools/server/tests/test_slots_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import pytest
from utils import *

# Test the new secure slots monitoring endpoint
def test_slots_status_basic():
    """A plain GET on `/slots/status` must answer with HTTP 200."""
    # NOTE(review): `server` is not defined in this file; presumably it comes
    # from `from utils import *` or a fixture, started with slots enabled - confirm.
    global server
    response = server.make_request("GET", "/slots/status")
    status = response.status_code
    assert status == 200

def test_slots_status_structure():
    """Check the top-level shape of the `/slots/status` response.

    Verifies that every documented top-level field is present, that `slots`
    is a list, and that the aggregate counters agree with the list itself.
    """
    # NOTE(review): `server` is not defined in this file; presumably it comes
    # from `from utils import *` or a fixture, started with slots enabled - confirm.
    global server
    res = server.make_request("GET", "/slots/status")
    # fail with a clear status mismatch instead of a confusing key error below
    assert res.status_code == 200
    data = res.body

    # Check response structure
    expected_keys = (
        "slots",
        "total_slots",
        "idle_slots",
        "processing_slots",
        "queue_size",
        "server_uptime_ms",
    )
    for key in expected_keys:
        assert key in data, f"missing top-level field: {key}"
    assert isinstance(data["slots"], list)

    # The counters are derived from the same per-slot list, so they must agree
    assert data["total_slots"] == len(data["slots"])
    assert data["idle_slots"] + data["processing_slots"] == data["total_slots"]

def test_slots_status_no_sensitive_data():
    """Critical security test: ensure no sensitive data leakage.

    Applies two checks to every slot entry: an explicit deny-list of fields
    that must never appear, and an allow-list so that any field newly added
    to the response is caught even if it is not on the deny-list.
    """
    # NOTE(review): `server` is not defined in this file; presumably it comes
    # from `from utils import *` or a fixture, started with slots enabled - confirm.
    global server
    res = server.make_request("GET", "/slots/status")
    assert res.status_code == 200
    data = res.body

    # These fields must NEVER appear in the response
    forbidden_fields = (
        "prompt",
        "generated_text",
        "generated",
        "tokens",
        "cache_tokens",
    )
    # The only per-slot fields the endpoint is meant to expose
    allowed_fields = {
        "id",
        "state",
        "is_processing",
        "n_ctx",
        "n_past",
        "n_decoded",
        "n_remaining",
        "truncated",
    }

    for slot in data["slots"]:
        for field in forbidden_fields:
            assert field not in slot
        unexpected = set(slot) - allowed_fields
        assert not unexpected, f"unexpected slot fields: {sorted(unexpected)}"