@@ -2009,75 +2009,75 @@ struct server_context {
                         }

                         slot.n_prompt_tokens_processed = 0;
-                    }

-                    // non-causal tasks require to fit the entire prompt in the physical batch
-                    if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
-                        // cannot fit the prompt in the current batch - will try next iter
-                        if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
-                            continue;
+                        // non-causal tasks require to fit the entire prompt in the physical batch
+                        if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING || slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                            // cannot fit the prompt in the current batch - will try next iter
+                            if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
+                                continue;
+                            }
                         }
-                    }

-                    // check that we are in the right batch_type, if not defer the slot
-                    const bool slot_type =
-                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ||
-                        slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0;
+                        // check that we are in the right batch_type, if not defer the slot
+                        const bool slot_type =
+                            slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING ||
+                            slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK ? 1 : 0;

-                    if (batch_type == -1) {
-                        batch_type = slot_type;
-                    } else if (batch_type != slot_type) {
-                        continue;
-                    }
+                        if (batch_type == -1) {
+                            batch_type = slot_type;
+                        } else if (batch_type != slot_type) {
+                            continue;
+                        }

-                    // keep only the common part
-                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
-                        // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                        // keep only the common part
+                        if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
+                            // could not partially delete (likely using a non-Transformer model)
+                            llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);

-                        // there is no common part left
-                        slot.n_past = 0;
+                            // there is no common part left
+                            slot.n_past = 0;

-                        common_sampler_reset(slot.smpl);
-                    }
+                            common_sampler_reset(slot.smpl);
+                        }

-                    SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
+                        SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);

-                    // remove the non-common part from the cache
-                    slot.cache_tokens.resize(slot.n_past);
+                        // remove the non-common part from the cache
+                        slot.cache_tokens.resize(slot.n_past);

-                    // add prompt tokens for processing in the current batch
-                    while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
-                        common_batch_add(batch, slot.prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
+                        // add prompt tokens for processing in the current batch
+                        while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                            common_batch_add(batch, slot.prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);

-                        if (slot.params.cache_prompt) {
-                            slot.cache_tokens.push_back(slot.prompt_tokens[slot.n_past]);
-                        }
+                            if (slot.params.cache_prompt) {
+                                slot.cache_tokens.push_back(slot.prompt_tokens[slot.n_past]);
+                            }

-                        slot.n_prompt_tokens_processed++;
-                        slot.n_past++;
-                    }
+                            slot.n_prompt_tokens_processed++;
+                            slot.n_past++;
+                        }

-                    SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
+                        SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);

-                    // entire prompt has been processed
-                    if (slot.n_past == slot.n_prompt_tokens) {
-                        slot.state = SLOT_STATE_DONE_PROMPT;
+                        // entire prompt has been processed
+                        if (slot.n_past == slot.n_prompt_tokens) {
+                            slot.state = SLOT_STATE_DONE_PROMPT;

-                        GGML_ASSERT(batch.n_tokens > 0);
+                            GGML_ASSERT(batch.n_tokens > 0);

-                        // Process all prompt tokens through sampler system
-                        for (int i = 0; i < slot.n_prompt_tokens; ++i) {
-                            common_sampler_accept(slot.smpl, slot.prompt_tokens[i], false);
-                        }
+                            // Process all prompt tokens through sampler system
+                            for (int i = 0; i < slot.n_prompt_tokens; ++i) {
+                                common_sampler_accept(slot.smpl, slot.prompt_tokens[i], false);
+                            }

-                        // extract the logits only for the last token
-                        batch.logits[batch.n_tokens - 1] = true;
+                            // extract the logits only for the last token
+                            batch.logits[batch.n_tokens - 1] = true;

-                        slot.n_decoded = 0;
-                        slot.i_batch   = batch.n_tokens - 1;
+                            slot.n_decoded = 0;
+                            slot.i_batch   = batch.n_tokens - 1;

-                        SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
+                            SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
+                        }
                     }

                 if (batch.n_tokens >= n_batch) {
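
For context on the batch_type gating in this hunk: embedding and rerank prompts run with non-causal attention, so the whole prompt has to land in one physical batch, and such slots must not share a llama_batch with regular causal completion slots. Below is a minimal, self-contained sketch of that selection policy; mock_slot, mock_cmpl_type, and pick_batch are hypothetical names for illustration, not part of this patch.

#include <cstddef>
#include <vector>

enum mock_cmpl_type { MOCK_CMPL_NORMAL, MOCK_CMPL_EMBEDDING, MOCK_CMPL_RERANK };

struct mock_slot {
    mock_cmpl_type type;
    int            n_prompt_tokens;
};

// Return the indices of the slots that could be batched together this iteration,
// mirroring the gating above: the first accepted slot fixes the batch type and
// mismatching slots are deferred to a later iteration.
std::vector<std::size_t> pick_batch(const std::vector<mock_slot> & slots, int n_batch) {
    std::vector<std::size_t> picked;

    int n_tokens   = 0;
    int batch_type = -1; // -1 = undecided, 0 = causal, 1 = non-causal

    for (std::size_t i = 0; i < slots.size(); ++i) {
        const int slot_type =
            slots[i].type == MOCK_CMPL_EMBEDDING ||
            slots[i].type == MOCK_CMPL_RERANK ? 1 : 0;

        // non-causal prompts must fit the current physical batch - defer otherwise
        if (slot_type == 1 && n_tokens + slots[i].n_prompt_tokens > n_batch) {
            continue;
        }

        if (batch_type == -1) {
            batch_type = slot_type;
        } else if (batch_type != slot_type) {
            continue;
        }

        picked.push_back(i);
        n_tokens += slots[i].n_prompt_tokens;
    }

    return picked;
}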
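The llama_kv_cache_seq_rm calls implement prompt-cache reuse: slot.n_past holds the length of the prefix shared between slot.cache_tokens and the new prompt, positions [n_past, end) are evicted, and only the remaining suffix is re-decoded. If the partial removal fails (per the comment, likely a non-Transformer model that cannot drop an arbitrary range), the whole sequence is cleared, n_past falls back to 0, and the sampler is reset. A hedged sketch of how such a common-prefix length could be computed; common_prefix_len is a hypothetical helper, not the server's actual code path.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

using llama_token = std::int32_t; // llama.cpp tokens are 32-bit ints

// Length of the prefix shared between the cached sequence and the new prompt;
// everything at positions [result, end) has to be evicted and re-decoded.
std::size_t common_prefix_len(const std::vector<llama_token> & cache,
                              const std::vector<llama_token> & prompt) {
    const std::size_t n = std::min(cache.size(), prompt.size());

    std::size_t i = 0;
    while (i < n && cache[i] == prompt[i]) {
        ++i;
    }
    return i;
}

With n_past computed this way, the while loop in the hunk feeds only prompt_tokens[n_past:] through common_batch_add, which is what makes repeated requests sharing a prefix (e.g. a fixed system prompt) cheap to process.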