 #include "json-schema-to-grammar.mjs.hpp"
 
 #include <atomic>
-#include <chrono>
 #include <condition_variable>
 #include <cstddef>
 #include <cinttypes>
 #include <unordered_map>
 #include <unordered_set>
 
-#define SLT_INF(fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, id, id_task, __VA_ARGS__)
-#define SLT_WRN(fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, id, id_task, __VA_ARGS__)
-#define SLT_ERR(fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, id, id_task, __VA_ARGS__)
-#define SLT_DBG(fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, id, id_task, __VA_ARGS__)
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
 
 #define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
@@ -210,7 +209,7 @@ struct server_slot {
     std::function<void(int)> callback_on_release;
 
     void reset() {
-        SLT_DBG("%s", "\n");
+        SLT_DBG(*this, "%s", "\n");
 
         n_prompt_tokens    = 0;
         generated_text     = "";
@@ -251,15 +250,15 @@ struct server_slot {
 
     void add_token(const completion_token_output & token) {
         if (!is_processing()) {
-            SLT_WRN("%s", "slot is not processing\n");
+            SLT_WRN(*this, "%s", "slot is not processing\n");
             return;
         }
         generated_token_probs.push_back(token);
     }
 
     void release() {
         if (is_processing()) {
-            SLT_INF("stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
+            SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
 
             t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
             state = SLOT_STATE_IDLE;
@@ -316,7 +315,8 @@ struct server_slot {
         const double t_gen        =       t_token_generation / n_decoded;
         const double n_gen_second = 1e3 / t_token_generation * n_decoded;
 
-        SLT_INF("\n"
+        SLT_INF(*this,
+                "\n"
                 "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
                 "\r       eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
                 "\r      total time = %10.2f ms / %5d tokens\n",
@@ -694,7 +694,7 @@ struct server_context {
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;
 
-            SRV_INF("new slot, id_slot = %d, n_ctx_slot = %d\n", slot.id, slot.n_ctx);
+            SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
             const int ga_n = params.grp_attn_n;
             const int ga_w = params.grp_attn_w;
@@ -705,7 +705,7 @@ struct server_context {
                 // GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
                 // GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
 
-                SRV_INF("slot self-extend: id_slot = %d, ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+                SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w);
             }
 
             slot.ga_i = 0;
@@ -828,7 +828,7 @@ struct server_context {
             }
 
             if (ret != nullptr) {
-                SRV_DBG("selected slot by lcp similarity, id_slot = %d, max_lcp_len = %d, similarity = %f\n", ret->id, max_lcp_len, similarity);
+                SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
             }
         }
 
@@ -849,7 +849,7 @@ struct server_context {
             }
 
             if (ret != nullptr) {
-                SRV_DBG("selected slot by lru, id_slot = %d, t_last = %" PRId64 "\n", ret->id, t_last);
+                SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last);
             }
         }
 
@@ -914,13 +914,13 @@ struct server_context {
 
         if (slot.params.cache_prompt && slot.ga_n != 1) {
             slot.params.cache_prompt = false;
-            SRV_WRN("slot %d: group-attention is not supported with prompt caching. disabling cache", slot.id);
+            SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n");
         }
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
             slot.params.n_predict = slot.n_predict;
-            SRV_WRN("slot %d: n_predict exceeds server configuration, setting to %d", slot.id, slot.n_predict);
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
         }
 
         // infill
@@ -1029,7 +1029,7 @@ struct server_context {
         slot.state = SLOT_STATE_PROCESSING_PROMPT;
         slot.prompt_tokens.clear();
 
-        SRV_INF("processing task, slot id = %d, task id = %d\n", slot.id, slot.id_task);
+        SLT_INF(slot, "%s", "processing task\n");
 
         return true;
     }
@@ -1164,54 +1164,30 @@ struct server_context {
             slot.stopped_limit  = true;
             slot.has_next_token = false;
 
-            SRV_DBG("slot %d, task %d: stopped by limit, n_decoded = %d, n_predict = %d\n", slot.id, slot.id_task, slot.n_decoded, slot.params.n_predict);
+            SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
         }
 
         if (llama_token_is_eog(model, result.tok)) {
             slot.stopped_eos    = true;
             slot.has_next_token = false;
 
-            SRV_DBG("slot %d, task %d: stopped by EOS\n", slot.id, slot.id_task);
+            SLT_DBG(slot, "%s", "stopped by EOS\n");
         }
 
         const auto n_ctx_train = llama_n_ctx_train(model);
 
         if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
-            // LOG_WARNING("n_predict is not set and self-context extend is disabled."
-            //             " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
-            //         { "id_slot",              slot.id },
-            //         { "params.n_predict",     slot.params.n_predict },
-            //         { "slot.n_prompt_tokens", slot.n_prompt_tokens },
-            //         { "slot.n_decoded",       slot.n_decoded },
-            //         { "slot.n_predict",       slot.n_predict },
-            //         { "n_slots",              params.n_parallel },
-            //         { "slot.n_ctx",           slot.n_ctx },
-            //         { "n_ctx",                n_ctx },
-            //         { "n_ctx_train",          n_ctx_train },
-            //         { "ga_n",                 slot.ga_n },
-            //     });
             slot.truncated      = true;
             slot.stopped_limit  = true;
             slot.has_next_token = false; // stop prediction
 
-            SRV_WRN("slot %d, task %d: n_predict is not set and self-context extend is disabled."
-                    " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop\n", slot.id, slot.id_task);
-        }
-
-        // LOG_VERBOSE("next token", {
-        //     {"id_slot",        slot.id},
-        //     {"id_task",        slot.id_task},
-        //     {"token",          result.tok},
-        //     {"token_text",     tokens_to_output_formatted_string(ctx, result.tok)},
-        //     {"has_next_token", slot.has_next_token},
-        //     {"n_remain",       slot.n_remaining},
-        //     {"n_decoded",      slot.n_decoded},
-        //     {"stopped_eos",    slot.stopped_eos},
-        //     {"stopped_word",   slot.stopped_word},
-        //     {"stopped_limit",  slot.stopped_limit},
-        //     {"stopping_word",  slot.stopping_word},
-        // });
-        SRV_DBG("slot %d, task %d, n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.id, slot.id_task, slot.n_decoded, slot.n_remaining, token_str.c_str());
+            SLT_WRN(slot,
+                    "n_predict (%d) is not set and self-context extend is disabled. "
+                    "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
+                    slot.params.n_predict, n_ctx_train);
+        }
+
+        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
 
         return slot.has_next_token; // continue
     }
@@ -1387,7 +1363,7 @@ struct server_context {
             }
 
             if (embd == NULL) {
-                SRV_ERR("failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
+                SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
 
                 res.data = json {
                     {"embedding", std::vector<float>(n_embd, 0.0f)},
@@ -1404,6 +1380,8 @@ struct server_context {
             };
         }
 
+        SLT_DBG(slot, "%s", "sending embeddings\n");
+
         queue_results.send(res);
     }
 
@@ -1841,7 +1819,7 @@ struct server_context {
                     const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
                     const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
 
-                    SRV_WRN("slot context shift, id_slot = %d, id_task = %d, n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.id_task, n_keep, n_left, n_discard);
+                    SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
                     llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
                     llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
@@ -1884,7 +1862,8 @@ struct server_context {
                 slot.cache_tokens.push_back(slot.sampled);
             }
 
-            SRV_DBG("slot decode token, id_slot = %d, id_task = %d, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", slot.id, slot.id_task, slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
+            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
+                    slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
         }
 
         // process in chunks of params.n_batch
@@ -1905,7 +1884,7 @@ struct server_context {
 
                     // we haven't tokenized the prompt yet - do it now:
                     if (prompt_tokens.empty()) {
-                        SRV_INF("tokenizing prompt, id_slot = %d, id_task = %d\n", slot.id, slot.id_task);
+                        SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size());
 
                         slot.t_start_process_prompt = ggml_time_us();
                         slot.t_start_generation = 0;
@@ -1949,11 +1928,11 @@ struct server_context {
                         slot.n_past = 0;
                         slot.n_prompt_tokens = prompt_tokens.size();
 
-                        SRV_INF("prompt tokenized, id_slot = %d, id_task = %d, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.id, slot.id_task, slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
+                        SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
 
                         // empty prompt passed -> release the slot and send empty response
                         if (prompt_tokens.empty()) {
-                            SRV_WRN("empty prompt - releasing slot, id_slot = %d, id_task = %d\n", slot.id, slot.id_task);
+                            SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
 
                             slot.release();
                             slot.print_timings();
@@ -1995,7 +1974,7 @@ struct server_context {
                                 slot.truncated = true;
                                 slot.n_prompt_tokens = prompt_tokens.size();
 
-                                SRV_WRN("input truncated, id_slot = %d, id_task = %d, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.id, slot.id_task, slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
+                                SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
 
                                 GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                             }
@@ -2020,7 +1999,7 @@ struct server_context {
 
                         if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
                             // we have to evaluate at least 1 token to generate logits.
-                            SRV_WRN("need to evaluate at least 1 token to generate logits, id_slot = %d, id_task = %d\n", slot.id, slot.id_task);
+                            SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
 
                             slot.n_past--;
                             if (slot.ga_i > 0) {
@@ -2069,7 +2048,7 @@ struct server_context {
                     // remove the non-common part from the cache
                     slot.cache_tokens.resize(slot.n_past);
 
-                    SRV_INF("kv cache rm [%d, end), id_slot = %d, id_task = %d\n", slot.n_past, slot.id, slot.id_task);
+                    SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
 
                     int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
 
@@ -2105,7 +2084,7 @@ struct server_context {
                     //     {"n_tokens", batch.n_tokens},
                     //     {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
                     // });
-                    SRV_INF("prompt processing progress, id_slot = %d, n_past = %d, n_tokens = %d, progress = %f\n", slot.id, slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
+                    SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
                     // entire prompt has been processed
                     if (slot.n_past == slot.n_prompt_tokens) {
@@ -2125,7 +2104,7 @@ struct server_context {
                         //     {"n_ctx",    n_ctx},
                         //     {"n_tokens", batch.n_tokens},
                         // });
-                        SRV_INF("prompt done, id_slot = %d, n_past = %d, n_tokens = %d\n", slot.id, slot.n_past, batch.n_tokens);
+                        SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
                     }
                 }
 
@@ -2158,9 +2137,9 @@ struct server_context {
                         const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                         const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
 
-                        SRV_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        SRV_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        SRV_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                        SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        SLT_DBG(slot, "div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
 
                         llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
                         llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
@@ -2170,7 +2149,7 @@ struct server_context {
 
                         slot.ga_i += slot.ga_w / slot.ga_n;
 
-                        SRV_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                        SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                     }
 
                     slot.n_past_se += n_tokens;
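
Usage note (outside the diff): the point of the macro change above is that the per-slot log macros now take the slot explicitly, so they can be called from server_context code as well as from server_slot member functions, and the id/id_task prefix no longer has to be written out at every call site. Below is a minimal standalone C++ sketch of that pattern; the printf-based LOG_INF stand-in and the stripped-down server_slot are illustrative assumptions, not the real definitions from server.cpp and common/log.h.

    #include <cstdio>

    // simplified stand-in for the real LOG_INF helper (assumption, for illustration only)
    #define LOG_INF(fmt, ...) printf(fmt, __VA_ARGS__)

    // new-style macro: the slot is passed explicitly, so it works outside server_slot methods
    #define SLT_INF(slot, fmt, ...) \
        LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)

    struct server_slot {
        int id      = 0;
        int id_task = -1;
    };

    int main() {
        server_slot slot;
        slot.id      = 1;
        slot.id_task = 42;

        // old call sites spelled the ids out by hand:
        //   SRV_INF("new slot, id_slot = %d, n_ctx_slot = %d\n", slot.id, slot.n_ctx);
        // new call sites let the macro prepend "id | task" automatically:
        SLT_INF(slot, "n_ctx_slot = %d\n", 4096);
        return 0;
    }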