
Commit f6dd38c

server : remove legacy system_prompt feature
ggml-ci
1 parent 11ac980 commit f6dd38c

File tree (3 files changed: +17, -104 lines)

  common/arg.cpp
  common/common.h
  examples/server/server.cpp

common/arg.cpp

Lines changed: 0 additions & 17 deletions
@@ -1788,23 +1788,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
-    add_opt(common_arg(
-        {"-spf", "--system-prompt-file"}, "FNAME",
-        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
-        [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
-            }
-            std::string system_prompt;
-            std::copy(
-                std::istreambuf_iterator<char>(file),
-                std::istreambuf_iterator<char>(),
-                std::back_inserter(system_prompt)
-            );
-            params.system_prompt = system_prompt;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

common/common.h

Lines changed: 0 additions & 1 deletion
@@ -282,7 +282,6 @@ struct common_params {
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;

     std::vector<std::string> api_keys;

examples/server/server.cpp

Lines changed: 17 additions & 86 deletions
@@ -623,12 +623,6 @@ struct server_context {

     int32_t n_ctx; // total context for all clients / slots

-    // system prompt
-    bool system_need_update = false;
-
-    std::string system_prompt;
-    std::vector<llama_token> system_tokens;
-
     // slots / clients
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;
@@ -665,7 +659,7 @@ struct server_context {
     bool load_model(const common_params & params_) {
         params = params_;

-        // dedicate one sequence to the system prompt
+        // reserve one extra sequence (seq_id == 0) for extra features
         params.n_parallel += 1;

         common_init_result llama_init = common_init_from_params(params);
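The `params.n_parallel += 1` that survives the comment change keeps one sequence (seq_id == 0) out of the slot pool, which is why the rest of this file addresses the KV cache as `slot.id + 1`. A tiny self-contained illustration of that layout (the slot count of 4 is a made-up example):

    #include <cstdio>

    int main() {
        int n_parallel = 4;   // hypothetical -np / --parallel value from the command line
        n_parallel += 1;      // the hunk above reserves one extra sequence, seq_id == 0

        for (int slot_id = 0; slot_id < n_parallel - 1; ++slot_id) {
            // each slot decodes into sequence slot_id + 1, leaving sequence 0 unused by slots
            printf("slot %d -> seq_id %d\n", slot_id, slot_id + 1);
        }
        return 0;
    }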
@@ -1061,51 +1055,6 @@ struct server_context {
         clean_kv_cache = false;
     }

-    void system_prompt_update() {
-        SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
-
-        kv_cache_clear();
-        system_tokens.clear();
-
-        if (!system_prompt.empty()) {
-            system_tokens = common_tokenize(ctx, system_prompt, true);
-
-            const int32_t n_batch = llama_n_batch(ctx);
-            const int32_t n_tokens_prompt = system_tokens.size();
-
-            for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
-                const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
-
-                common_batch_clear(batch);
-
-                for (int32_t j = 0; j < n_tokens; ++j) {
-                    common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
-                }
-
-                if (llama_decode(ctx, batch) != 0) {
-                    SRV_ERR("%s", "llama_decode() failed\n");
-                    return;
-                }
-            }
-
-            // assign the system KV cache to all parallel sequences
-            for (int32_t i = 1; i <= params.n_parallel; ++i) {
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-            }
-        }
-
-        system_need_update = false;
-    }
-
-    bool system_prompt_set(const std::string & sys_prompt) {
-        SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
-
-        system_prompt = sys_prompt;
-        // update system_tokens and KV cache as soon as all slots are idle
-        system_need_update = true;
-        return true;
-    }
-
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
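What the deleted `system_prompt_update()` implemented is the generic shared-prefix trick: decode a token sequence once into sequence 0, then copy that KV range into every slot sequence so no slot has to re-evaluate it. A compressed sketch of the same pattern, built from the llama.cpp helpers the deleted code already called (the free function and its parameters are illustrative, not something that exists in the tree):

    // decode `tokens` once into seq 0, then share the resulting KV cache with seqs 1..n_parallel
    static bool decode_shared_prefix(llama_context * ctx, llama_batch & batch,
                                     const std::vector<llama_token> & tokens, int32_t n_parallel) {
        const int32_t n_batch  = llama_n_batch(ctx);
        const int32_t n_tokens = (int32_t) tokens.size();

        for (int32_t i = 0; i < n_tokens; i += n_batch) {
            const int32_t n_chunk = std::min(n_batch, n_tokens - i);

            common_batch_clear(batch);
            for (int32_t j = 0; j < n_chunk; ++j) {
                // position i + j, sequence 0, no logits needed for prefix tokens
                common_batch_add(batch, tokens[i + j], i + j, { 0 }, false);
            }
            if (llama_decode(ctx, batch) != 0) {
                return false;
            }
        }

        // make the prefix visible to every parallel sequence without re-decoding it
        for (int32_t s = 1; s <= n_parallel; ++s) {
            llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
        }
        return true;
    }

With the feature removed, a shared prefix has to arrive through each request's own prompt (or be reused via the per-slot prompt cache) rather than being pinned server-wide.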
@@ -1855,12 +1804,8 @@ struct server_context {
             }

             if (all_idle) {
-                if (system_need_update) {
-                    system_prompt_update();
-                }
-
                 SRV_INF("%s", "all slots are idle\n");
-                if (system_prompt.empty() && clean_kv_cache) {
+                if (clean_kv_cache) {
                     kv_cache_clear();
                 }

@@ -1882,7 +1827,7 @@ struct server_context {
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
             if (slot.ga_n == 1) {
-                if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
                     if (!params.ctx_shift) {
                         // this check is redundant (for good)
                         // we should never get here, because generation should already stopped in process_token()
@@ -1893,13 +1838,13 @@ struct server_context {

                     // Shift context
                     const int n_keep = slot.params.n_keep + add_bos_token;
-                    const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
+                    const int n_left = slot.n_past - n_keep;
                     const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);

                     SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

                     llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);

                     if (slot.params.cache_prompt) {
                         for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
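Dropping `system_tokens.size()` makes the shift arithmetic operate purely on slot-local positions. With concrete (made-up) numbers it works out as follows:

    #include <cstdio>

    int main() {
        // made-up values for one slot, now that system tokens no longer pad the positions
        const int n_past    = 1000;            // tokens currently in this slot's sequence
        const int n_keep    = 32;              // slot.params.n_keep + add_bos_token
        const int n_left    = n_past - n_keep; // 968 shiftable tokens
        const int n_discard = n_left / 2;      // 484, the default when slot.params.n_discard == 0

        // the two KV-cache calls above then do, for this slot's sequence:
        //   llama_kv_cache_seq_rm : drop cells at positions [n_keep, n_keep + n_discard) = [32, 516)
        //   llama_kv_cache_seq_add: shift positions [n_keep + n_discard, n_past) = [516, 1000) by -484
        printf("n_left = %d, n_discard = %d\n", n_left, n_discard);
        return 0;
    }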
@@ -1929,18 +1874,16 @@ struct server_context {

             const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;

-            // TODO: we always have to take into account the "system_tokens"
-            // this is not great and needs to be improved somehow
-            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot_npast, { slot.id + 1 }, true);

             slot.n_past += 1;

             if (slot.params.cache_prompt) {
                 slot.cache_tokens.push_back(slot.sampled);
             }

-            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
-                    slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
+            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
+                    slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
         }

         // process in chunks of params.n_batch
@@ -1971,7 +1914,7 @@ struct server_context {
                             case SERVER_TASK_CMPL_TYPE_NORMAL:
                             case SERVER_TASK_CMPL_TYPE_EMBEDDING:
                                 {
-                                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
+                                    prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
                                 } break;
                             case SERVER_TASK_CMPL_TYPE_RERANK:
                                 {
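Previously a BOS token was added only when no system prompt was configured; now the decision comes from the model metadata alone. In terms of the underlying helpers, the new call boils down to something like this (a sketch against the llama.cpp API at this commit; `ctx`, `model`, and `prompt` are assumed to exist in scope):

    // let the model's metadata decide whether a BOS token is prepended
    const bool add_bos = llama_add_bos_token(model);

    // parse_special = true so special/control tokens written in the prompt text are tokenized as such
    std::vector<llama_token> prompt_tokens = common_tokenize(ctx, prompt, add_bos, /*parse_special =*/ true);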
@@ -2050,7 +1993,7 @@ struct server_context {
                         } else {
                             if (!params.ctx_shift) {
                                 // if context shift is disabled, we make sure prompt size is smaller than KV size
-                                if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
+                                if (slot.n_prompt_tokens >= slot.n_ctx) {
                                     slot.release();
                                     send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
                                     continue;
@@ -2138,22 +2081,16 @@ struct server_context {
                         }

                         // keep only the common part
-                        int p0 = (int) system_tokens.size() + slot.n_past;
+                        int p0 = slot.n_past;
                         if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
                             // could not partially delete (likely using a non-Transformer model)
                             llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);

-                            p0 = (int) system_tokens.size();
-                            if (p0 != 0) {
-                                // copy over the system prompt when there is one
-                                llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
-                            }
-
-                            // there is no common part left (except for the system prompt)
+                            // there is no common part left
                             slot.n_past = 0;
                             slot.n_past_se = 0;
                             slot.ga_i = 0;
-                            // TODO: is the system prompt ever in the sampling context?
+
                             common_sampler_reset(slot.smpl);
                         }

@@ -2179,7 +2116,7 @@ struct server_context {
                             }
                         }

-                        common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                        common_batch_add(batch, prompt_tokens[slot.n_past], slot_npast, { slot.id + 1 }, false);

                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2409,10 +2346,6 @@ int main(int argc, char ** argv) {
     // struct that contains llama context and inference
     server_context ctx_server;

-    if (!params.system_prompt.empty()) {
-        ctx_server.system_prompt_set(params.system_prompt);
-    }
-
     if (params.model_alias == "unknown") {
         params.model_alias = params.model;
     }
@@ -2840,7 +2773,7 @@ int main(int argc, char ** argv) {

     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         json data = {
-            { "system_prompt", ctx_server.system_prompt },
+            { "system_prompt", "[unavailable]" },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params.n_parallel },
             { "chat_template", llama_get_chat_template(ctx_server.model) },
@@ -2856,10 +2789,8 @@ int main(int argc, char ** argv) {
         }

         json data = json::parse(req.body);
-        if (data.contains("system_prompt")) {
-            std::string system_prompt = data.at("system_prompt");
-            ctx_server.system_prompt_set(system_prompt);
-        }
+
+        // update any props here

         res_ok(res, {{ "success", true }});
     };
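With both the POST /props setter and the server-wide state gone, the system prompt becomes a per-request concern. As an assumption about typical usage rather than anything this commit documents, a client gets equivalent behavior by sending the system text with every request, for example as the leading message of an OpenAI-style chat completion body, built here with the same nlohmann::json library the server uses:

    #include <cstdio>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        // request body for POST /v1/chat/completions with the system text inlined per request
        const json body = {
            { "messages", json::array({
                { { "role", "system" }, { "content", "You are a helpful assistant." } },
                { { "role", "user"   }, { "content", "Hello!" } },
            }) },
        };

        printf("%s\n", body.dump(2).c_str());
        return 0;
    }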
