Commit 460ec6f

server : more explicit endpoint access settings
1 parent 96b6912 commit 460ec6f

File tree

5 files changed, +107 -102 lines changed


common/arg.cpp

Lines changed: 15 additions & 1 deletion

@@ -1838,9 +1838,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.endpoint_metrics = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
+    add_opt(llama_arg(
+        {"--slots"},
+        format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_slots = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
+    add_opt(llama_arg(
+        {"--props"},
+        format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.endpoint_props = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(llama_arg(
         {"--no-slots"},
-        format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        "disables slots monitoring endpoint",
         [](gpt_params & params) {
             params.endpoint_slots = false;
         }

common/common.h

Lines changed: 4 additions & 1 deletion

@@ -290,7 +290,10 @@ struct gpt_params {
     std::string ssl_file_key  = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT

-    bool endpoint_slots   = true;
+    // "advanced" endpoints are disabled by default for better security
+    bool webui            = true;
+    bool endpoint_slots   = false;
+    bool endpoint_props   = false; // only control POST requests, not GET
     bool endpoint_metrics = false;

     bool log_json = false;
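In effect, a stock server now starts with the monitoring and mutation endpoints off. A small sketch (not part of the diff) of the resulting defaults and their CLI equivalents, assuming the `gpt_params` fields above:

```cpp
// Sketch only: post-commit endpoint defaults in gpt_params.
#include "common.h"

void endpoint_defaults() {
    gpt_params params;
    // params.endpoint_slots   == false -> opt in with --slots   (LLAMA_ARG_ENDPOINT_SLOTS)
    // params.endpoint_props   == false -> opt in with --props   (LLAMA_ARG_ENDPOINT_PROPS)
    // params.endpoint_metrics == false -> opt in with --metrics (LLAMA_ARG_ENDPOINT_METRICS)
    // params.webui            == true  -> served unless an API key is set (see server.cpp below)
    params.endpoint_slots = true; // same effect as passing --slots
    params.endpoint_props = true; // same effect as passing --props
}
```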

examples/server/README.md

Lines changed: 16 additions & 33 deletions

@@ -380,8 +380,6 @@ node index.js

 `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`

-`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
-
 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.

 **Response format**
@@ -519,34 +517,41 @@ Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/B

 Takes a prefix and a suffix and returns the predicted completion as stream.

-    *Options:*
-
-    `input_prefix`: Set the prefix of the code to infill.
-
-    `input_suffix`: Set the suffix of the code to infill.
-
-It also accepts all the options of `/completion` except `stream` and `prompt`.
-
-- **GET** `/props`: Return current server settings.
+*Options:*
+
+- `input_prefix`: Set the prefix of the code to infill.
+- `input_suffix`: Set the suffix of the code to infill.
+
+It also accepts all the options of `/completion` except `stream` and `prompt`.
+
+### **GET** `/props`: Get server global properties.
+
+This endpoint is public (no API key check). By default it is read-only; to change global properties via POST, start the server with `--props`.

 **Response format**

 ```json
 {
-  "assistant_name": "",
-  "user_name": "",
+  "system_prompt": "",
   "default_generation_settings": { ... },
   "total_slots": 1,
   "chat_template": ""
 }
 ```

-- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
-- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+- `system_prompt` - the system prompt (initial prompt of all slots). Note that the chat template is not applied to it; it is simply prepended to the formatted prompt.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
 - `chat_template` - the model's original Jinja2 prompt template

+### POST `/props`: Change server global properties.
+
+To use this endpoint with the POST method, start the server with `--props`.
+
+*Options:*
+
+- `system_prompt`: Change the system prompt (initial prompt of all slots). Note that the chat template is not applied to it; it is simply prepended to the formatted prompt.
+
 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API

 Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
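As a quick illustration of the documented workflow (not part of the commit), a client can exercise both sides of `/props` with cpp-httplib, the HTTP library the server itself embeds. The sketch assumes a server started with `--props`, no API key, listening on localhost:8080:

```cpp
// Sketch only: reading and updating server global properties over HTTP.
#include <iostream>
#include "httplib.h" // cpp-httplib, vendored by llama.cpp

int main() {
    httplib::Client cli("localhost", 8080);

    // GET /props is public (no API key check) and always available
    if (auto res = cli.Get("/props")) {
        std::cout << "props: " << res->body << "\n";
    }

    // POST /props is rejected unless the server was started with --props
    auto res = cli.Post("/props",
                        R"({"system_prompt": "You are a helpful assistant."})",
                        "application/json");
    if (res && res->status == 200) {
        std::cout << res->body << "\n"; // expected: {"success": true}
    }
    return 0;
}
```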
@@ -813,28 +818,6 @@ To know the `id` of the adapter, use GET `/lora-adapters`

 ## More examples

-### Change system prompt on runtime
-
-To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
-
-`prompt`: Specify a context that you want all connecting clients to respect.
-
-`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
-
-`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
-
-```json
-{
-    "system_prompt": {
-        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
-        "anti_prompt": "User:",
-        "assistant_name": "Assistant:"
-    }
-}
-```
-
-**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
-
 ### Interactive mode

 Check the sample in [chat.mjs](chat.mjs).

examples/server/server.cpp

Lines changed: 59 additions & 67 deletions

@@ -1106,12 +1106,7 @@ struct server_context {
         SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());

         system_prompt = sys_prompt;
-
-        // release all slots
-        for (server_slot & slot : slots) {
-            slot.release();
-        }
-
+        // update system_tokens and KV cache as soon as all slots are idle
         system_need_update = true;
         return true;
     }
@@ -1627,16 +1622,6 @@ struct server_context {
                     break;
                 }

-                if (task.data.contains("system_prompt")) {
-                    std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
-                    system_prompt_set(sys_prompt);
-
-                    for (server_slot & slot : slots) {
-                        slot.n_past    = 0;
-                        slot.n_past_se = 0;
-                    }
-                }
-
                 slot->reset();

                 slot->id_task = task.id;
@@ -1862,10 +1847,6 @@ struct server_context {
     }

     void update_slots() {
-        if (system_need_update) {
-            system_prompt_update();
-        }
-
         // check if all slots are idle
         {
             bool all_idle = true;
@@ -1878,6 +1859,10 @@ struct server_context {
             }

             if (all_idle) {
+                if (system_need_update) {
+                    system_prompt_update();
+                }
+
                 SRV_INF("%s", "all slots are idle\n");
                 if (system_prompt.empty() && clean_kv_cache) {
                     kv_cache_clear();
@@ -2536,29 +2521,20 @@ int main(int argc, char ** argv) {
     //

     auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
-        // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
-        static const std::unordered_set<std::string> protected_endpoints = {
+        static const std::unordered_set<std::string> public_endpoints = {
+            "/health",
+            "/models",
+            "/v1/models",
             "/props",
-            "/completion",
-            "/completions",
-            "/v1/completions",
-            "/chat/completions",
-            "/v1/chat/completions",
-            "/infill",
-            "/tokenize",
-            "/detokenize",
-            "/embedding",
-            "/embeddings",
-            "/v1/embeddings",
         };

         // If API key is not set, skip validation
         if (params.api_keys.empty()) {
             return true;
         }

-        // If path is not in protected_endpoints list, skip validation
-        if (protected_endpoints.find(req.path) == protected_endpoints.end()) {
+        // If path is public, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end()) {
             return true;
         }
@@ -2620,7 +2596,7 @@ int main(int argc, char ** argv) {

     const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
         if (!params.endpoint_slots) {
-            res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED));
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2869,24 +2845,31 @@ int main(int argc, char ** argv) {
     };

     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
-        std::string template_key = "tokenizer.chat_template", curr_tmpl;
-        int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
-        if (tlen > 0) {
-            std::vector<char> curr_tmpl_buf(tlen + 1, 0);
-            if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
-                curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
-            }
-        }
         json data = {
-            { "system_prompt",               ctx_server.system_prompt.c_str() },
+            { "system_prompt",               ctx_server.system_prompt },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params.n_parallel },
-            { "chat_template",               curr_tmpl.c_str() },
+            { "chat_template",               llama_get_chat_template(ctx_server.model) },
         };

         res_ok(res, data);
     };

+    const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        if (!ctx_server.params.endpoint_props) {
+            res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        json data = json::parse(req.body);
+        if (data.contains("system_prompt")) {
+            std::string system_prompt = data.at("system_prompt");
+            ctx_server.system_prompt_set(system_prompt);
+        }
+
+        res_ok(res, {{ "success", true }});
+    };
+
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) {
         if (ctx_server.params.embedding || ctx_server.params.reranking) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
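One caveat in `handle_props_change` (not addressed by this commit): `json::parse` throws on a malformed request body. A defensive variant of the parse step, sketched as a drop-in fragment and assuming the `ERROR_TYPE_INVALID_REQUEST` variant from server.cpp's error enum:

```cpp
// Hypothetical hardening of the body parsing in handle_props_change;
// nlohmann::json::parse throws json::parse_error on invalid input.
json data;
try {
    data = json::parse(req.body);
} catch (const std::exception & e) {
    res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
    return;
}
```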
@@ -3265,30 +3248,39 @@ int main(int argc, char ** argv) {
         svr->set_base_dir(params.public_path);
     }

-    // using embedded static files
-    svr->Get("/",                           handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-    svr->Get("/index.js",                   handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/completion.js",              handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
-
-    // add new-ui files
-    svr->Get("/colorthemes.css",       handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
-    svr->Get("/style.css",             handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-ketivah.css",     handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-mangotango.css",  handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-playground.css",  handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-polarnight.css",  handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
-    svr->Get("/theme-snowstorm.css",   handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
-    svr->Get("/index-new.html",        handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
-    svr->Get("/system-prompts.js",     handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/prompt-formats.js",     handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+    if (!params.api_keys.empty()) {
+        // for now, if API key is set, web UI is unusable
+        svr->Get("/", [&](const httplib::Request & req, httplib::Response & res) {
+            return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
+        });
+    } else {
+        // using embedded static files
+        svr->Get("/",                           handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+        svr->Get("/index.js",                   handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/completion.js",              handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+
+        // add new-ui files
+        svr->Get("/colorthemes.css",       handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8"));
+        svr->Get("/style.css",             handle_static_file(style_css, style_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-ketivah.css",     handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-mangotango.css",  handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-playground.css",  handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-polarnight.css",  handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8"));
+        svr->Get("/theme-snowstorm.css",   handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8"));
+        svr->Get("/index-new.html",        handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
+        svr->Get("/system-prompts.js",     handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
+        svr->Get("/prompt-formats.js",     handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+    }

     // register API routes
-    svr->Get ("/health",              handle_health);
+    svr->Get ("/health",              handle_health); // public endpoint (no API key check)
     svr->Get ("/metrics",             handle_metrics);
-    svr->Get ("/props",               handle_props);
-    svr->Get ("/v1/models",           handle_models);
+    svr->Get ("/props",               handle_props); // public endpoint (no API key check)
+    svr->Post("/props",               handle_props_change);
+    svr->Get ("/models",              handle_models); // public endpoint (no API key check)
+    svr->Get ("/v1/models",           handle_models); // public endpoint (no API key check)
     svr->Post("/completion",          handle_completions); // legacy
     svr->Post("/completions",         handle_completions);
     svr->Post("/v1/completions",      handle_completions);
examples/server/utils.hpp

Lines changed: 13 additions & 0 deletions

@@ -90,6 +90,19 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
     return formatted_chat;
 }

+std::string llama_get_chat_template(const struct llama_model * model) {
+    std::string template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+    if (res < 0) {
+        return "";
+    } else {
+        std::vector<char> model_template(res, 0);
+        llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size());
+    }
+}
+
 //
 // base64 utils (TODO: move to common in the future)
 //
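`llama_model_meta_val_str` follows the usual size-query idiom: a call with a `NULL` buffer returns the required length, and a negative result means the key is absent. A minimal usage sketch (not from the commit), assuming a successfully loaded model handle:

```cpp
// Sketch only: log the model's chat template at startup.
#include <cstdio>
#include <string>

void log_chat_template(const struct llama_model * model) {
    const std::string tmpl = llama_get_chat_template(model);
    if (tmpl.empty()) {
        // no tokenizer.chat_template metadata; chat endpoints fall back
        // to the ChatML template in that case (see the README above)
        printf("chat template: (none)\n");
    } else {
        printf("chat template:\n%s\n", tmpl.c_str());
    }
}
```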
