@@ -623,12 +623,6 @@ struct server_context {
 
     int32_t n_ctx; // total context for all clients / slots
 
-    // system prompt
-    bool system_need_update = false;
-
-    std::string system_prompt;
-    std::vector<llama_token> system_tokens;
-
     // slots / clients
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;
@@ -665,7 +659,7 @@ struct server_context {
     bool load_model(const common_params & params_) {
         params = params_;
 
-        // dedicate one sequence to the system prompt
+        // reserve one extra sequence (seq_id == 0) for extra features
         params.n_parallel += 1;
 
         common_init_result llama_init = common_init_from_params(params);
@@ -1061,51 +1055,6 @@ struct server_context {
         clean_kv_cache = false;
     }
 
-    void system_prompt_update() {
-        SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
-
-        kv_cache_clear();
-        system_tokens.clear();
-
-        if (!system_prompt.empty()) {
-            system_tokens = common_tokenize(ctx, system_prompt, true);
-
-            const int32_t n_batch = llama_n_batch(ctx);
-            const int32_t n_tokens_prompt = system_tokens.size();
-
-            for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
-                const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
-
-                common_batch_clear(batch);
-
-                for (int32_t j = 0; j < n_tokens; ++j) {
-                    common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
-                }
-
-                if (llama_decode(ctx, batch) != 0) {
-                    SRV_ERR("%s", "llama_decode() failed\n");
-                    return;
-                }
-            }
-
-            // assign the system KV cache to all parallel sequences
-            for (int32_t i = 1; i <= params.n_parallel; ++i) {
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-            }
-        }
-
-        system_need_update = false;
-    }
-
-    bool system_prompt_set(const std::string & sys_prompt) {
-        SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
-
-        system_prompt = sys_prompt;
-        // update system_tokens and KV cache as soon as all slots are idle
-        system_need_update = true;
-        return true;
-    }
-
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
         const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
@@ -1855,12 +1804,8 @@ struct server_context {
         }
 
         if (all_idle) {
-            if (system_need_update) {
-                system_prompt_update();
-            }
-
             SRV_INF("%s", "all slots are idle\n");
-            if (system_prompt.empty() && clean_kv_cache) {
+            if (clean_kv_cache) {
                 kv_cache_clear();
             }
 
@@ -1882,7 +1827,7 @@ struct server_context {
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
             if (slot.ga_n == 1) {
-                if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                if (slot.is_processing() && slot.n_past >= slot.n_ctx - 1) {
                     if (!params.ctx_shift) {
                         // this check is redundant (for good)
                         // we should never get here, because generation should already stopped in process_token()
@@ -1893,13 +1838,13 @@ struct server_context {
 
                     // Shift context
                     const int n_keep    = slot.params.n_keep + add_bos_token;
-                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
+                    const int n_left    = slot.n_past - n_keep;
                     const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
 
                     SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
                     llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
 
                     if (slot.params.cache_prompt) {
                         for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -1929,18 +1874,16 @@ struct server_context {
 
             const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
 
-            // TODO: we always have to take into account the "system_tokens"
-            //       this is not great and needs to be improved somehow
-            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot_npast, { slot.id + 1 }, true);
 
             slot.n_past += 1;
 
             if (slot.params.cache_prompt) {
                 slot.cache_tokens.push_back(slot.sampled);
             }
 
-            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
-                    slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
+            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
+                    slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
         }
 
         // process in chunks of params.n_batch
@@ -1971,7 +1914,7 @@ struct server_context {
                     case SERVER_TASK_CMPL_TYPE_NORMAL:
                     case SERVER_TASK_CMPL_TYPE_EMBEDDING:
                         {
-                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty(), true); // add BOS if there isn't system prompt
+                            prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
                         } break;
                     case SERVER_TASK_CMPL_TYPE_RERANK:
                         {
@@ -2050,7 +1993,7 @@ struct server_context {
                 } else {
                     if (!params.ctx_shift) {
                         // if context shift is disabled, we make sure prompt size is smaller than KV size
-                        if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
+                        if (slot.n_prompt_tokens >= slot.n_ctx) {
                             slot.release();
                             send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
                             continue;
@@ -2138,22 +2081,16 @@ struct server_context {
                 }
 
                 // keep only the common part
-                int p0 = (int) system_tokens.size() + slot.n_past;
+                int p0 = slot.n_past;
                 if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
                     // could not partially delete (likely using a non-Transformer model)
                     llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
 
-                    p0 = (int) system_tokens.size();
-                    if (p0 != 0) {
-                        // copy over the system prompt when there is one
-                        llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
-                    }
-
-                    // there is no common part left (except for the system prompt)
+                    // there is no common part left
                     slot.n_past = 0;
                     slot.n_past_se = 0;
                     slot.ga_i = 0;
-                    // TODO: is the system prompt ever in the sampling context?
+
                     common_sampler_reset(slot.smpl);
                 }
 
@@ -2179,7 +2116,7 @@ struct server_context {
                     }
                 }
 
-                common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                common_batch_add(batch, prompt_tokens[slot.n_past], slot_npast, { slot.id + 1 }, false);
 
                 if (slot.params.cache_prompt) {
                     slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
@@ -2409,10 +2346,6 @@ int main(int argc, char ** argv) {
     // struct that contains llama context and inference
     server_context ctx_server;
 
-    if (!params.system_prompt.empty()) {
-        ctx_server.system_prompt_set(params.system_prompt);
-    }
-
     if (params.model_alias == "unknown") {
         params.model_alias = params.model;
     }
@@ -2840,7 +2773,7 @@ int main(int argc, char ** argv) {
 
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         json data = {
-            { "system_prompt",               ctx_server.system_prompt },
+            { "system_prompt",               "[unavailable]" },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params.n_parallel },
             { "chat_template",               llama_get_chat_template(ctx_server.model) },
@@ -2856,10 +2789,8 @@ int main(int argc, char ** argv) {
         }
 
         json data = json::parse(req.body);
-        if (data.contains("system_prompt")) {
-            std::string system_prompt = data.at("system_prompt");
-            ctx_server.system_prompt_set(system_prompt);
-        }
+
+        // update any props here
 
         res_ok(res, {{ "success", true }});
     };
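
Note: with the shared system prompt removed, KV-cache positions are pure per-slot values, so the context-shift hunk above works on slot.n_past alone. Below is a minimal standalone sketch of that arithmetic, using assumed example values (it does not touch the real server_slot struct; the llama_kv_cache_* calls it mirrors are kept as comments only):

// Sketch only: reproduces the context-shift bookkeeping from the diff above
// with hypothetical numbers.
#include <cstdio>

int main() {
    const int n_ctx     = 4096;            // per-slot context size (assumed)
    const int n_keep    = 32 + 1;          // slot.params.n_keep + add_bos_token (assumed)
    int       n_past    = n_ctx - 1;       // shift triggers once n_past >= n_ctx - 1

    const int n_left    = n_past - n_keep; // previously: system_tokens.size() + n_past - n_keep
    const int n_discard = n_left / 2;      // default when slot.params.n_discard == 0

    // The server drops [n_keep, n_keep + n_discard) and shifts the rest back:
    //   llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep,             n_keep + n_discard);
    //   llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, n_past, -n_discard);
    n_past -= n_discard;

    printf("n_keep = %d, n_left = %d, n_discard = %d, n_past after shift = %d\n",
           n_keep, n_left, n_discard, n_past);
    return 0;
}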