@@ -73,6 +73,8 @@ extern int num_probs_bottoms;
 
 extern float confidence_total;
 
+extern std::vector<llama_token> last_candidates_logits;
+
 #define SESSIONS_FOLDER "sessions/"
 
 static common_params paramsDefault;
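
Note on the new extern above (the defining side is not part of this diff): an extern declaration only announces the vector, so exactly one translation unit must still define it, presumably next to the sampling code that fills it. A minimal sketch of that assumed definition:

    // hypothetical defining .cpp (not shown in this commit)
    #include <vector>
    #include "llama.h"

    std::vector<llama_token> last_candidates_logits; // candidate tokens recorded at each sampling step
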
@@ -190,6 +192,7 @@ class chat
 private:
 
     llama_context * ctx = nullptr;
+    llama_memory_t mem = nullptr;
     llama_model * model = nullptr;
     common_sampler * smpl = nullptr;
     const llama_vocab * vocab = nullptr;
@@ -289,6 +292,8 @@ class chat
     std::string logit_bias_strings_ext_display = "";
     std::string logit_bias_strings_start_display = "";
 
+    std::string last_candidates_logits_display = "";
+
     struct llama_perf_context_data ctx_performance_data;
 
     // std::map<std::string,std::string> stats;
@@ -765,6 +770,14 @@ class chat
         }
     }
 
+    void get_last_candidates_logits_display() {
+        last_candidates_logits_display.clear();
+
+        for (auto logit : last_candidates_logits) {
+            last_candidates_logits_display += std::format("{}; ", common_token_to_piece(ctx, logit));
+        }
+    }
+
     void params_postfill() {
         if (params.kv_overrides_pair.size()) kv_override_prefill();
         common_process_override_tensors(params);
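
The new helper uses std::format, so it needs the C++20 <format> header. If the project targets an older standard, the same "piece; piece; " layout can be built with plain concatenation; a sketch of that fallback, reusing the same names:

    // pre-C++20 fallback sketch; mirrors the loop above
    last_candidates_logits_display.clear();
    for (auto tok : last_candidates_logits) {
        last_candidates_logits_display += common_token_to_piece(ctx, tok);
        last_candidates_logits_display += "; ";
    }
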
@@ -1250,6 +1263,9 @@ class chat
         ctx = llama_init.context.release();
         printf("..............CONTEXT INITIALIZED (%s)................\n", __func__);
 
+        mem = llama_get_memory(ctx);
+        printf("..............MEM INITIALIZED (%s)................\n", __func__);
+
         assignThreads();
         printf("..............THREADS ASSIGNED (%s)................\n", __func__);
 
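
For reference, llama_get_memory() is the upstream llama.cpp accessor that replaces the llama_kv_self_* family used in the hunks below. The handle refers to state owned by the context, so the ordering here matters; a sketch of the assumed lifetime:

    llama_context * ctx = /* created first */;
    llama_memory_t  mem = llama_get_memory(ctx); // handle into ctx's memory
    // assumed: no separate free call — the handle stays valid for ctx's
    // lifetime and is torn down together with the context
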
@@ -1402,7 +1418,7 @@ class chat
 
             // remove any "future" tokens that we might have inherited from the previous session
             // llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
-            llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+            llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1);
         }
 
         // if we will use the cache for the full prompt without reaching the end of the cache, force
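
Semantics carry over unchanged: llama_memory_seq_rm(mem, seq_id, p0, p1) removes cells with positions in [p0, p1), where seq_id == -1 matches every sequence and p1 == -1 means "to the end". With a hypothetical n_matching_session_tokens of 120, the call above reduces to:

    // keep the reused 120-token session prefix, drop everything after it,
    // across all sequences
    llama_memory_seq_rm(mem, -1, 120, -1);
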
@@ -1475,8 +1491,8 @@ class chat
             // always keep the first token - BOS
             // n_past = std::max(1, params.n_keep);
             // n_past_guidance = std::max(1, params.n_keep + guidance_offset);
-            llama_kv_self_seq_rm (ctx, 0, params.n_keep, params.n_keep + n_discard);
-            llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+            llama_memory_seq_rm (mem, 0, params.n_keep, params.n_keep + n_discard);
+            llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard);
 
             // insert n_left/2 tokens at the start of embd from last_n_tokens
             // embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
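
The rm/add pair is the usual llama.cpp context shift: drop the oldest half of the non-kept tokens, then slide the survivors left so positions stay contiguous. A worked trace with hypothetical values n_past = 1000 and params.n_keep = 4 (so n_left = 996, n_discard = 498):

    llama_memory_seq_rm (mem, 0, 4, 4 + 498);          // drop positions [4, 502)
    llama_memory_seq_add(mem, 0, 4 + 498, 1000, -498); // slide [502, 1000) down by 498
    // n_past -= n_discard  ->  decoding resumes at position 502
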
@@ -1510,8 +1526,8 @@ class chat
             const int n_left = n_past - params.n_keep;
             const int n_discard = n_left/2;
 
-            llama_kv_self_seq_rm (ctx, 0, params.n_keep, params.n_keep + n_discard);
-            llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+            llama_memory_seq_rm (mem, 0, params.n_keep, params.n_keep + n_discard);
+            llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard);
 
             n_past -= n_discard;
 
@@ -1524,9 +1540,9 @@ class chat
             const int bd = (ga_w/ga_n)*(ga_n - 1);
             const int dd = (ga_w/ga_n) - ib*bd - ga_w;
 
-            llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
-            llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
-            llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
+            llama_memory_seq_add(mem, 0, ga_i, n_past, ib*bd);
+            llama_memory_seq_div(mem, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
+            llama_memory_seq_add(mem, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
 
             n_past -= bd;
 
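
These three calls are the grouped self-attention ("Self-Extend") step from the upstream main example, ported to the memory handle: shift, compress one ga_w-wide window by a factor of ga_n, then shift the tail back. A trace with hypothetical values ga_n = 2, ga_w = 4, ga_i = 0, n_past = 8, which give ib = 0, bd = 2, dd = -2:

    llama_memory_seq_add(mem, 0, 0, 8, 0);  // ib*bd == 0: nothing moves yet
    llama_memory_seq_div(mem, 0, 0, 4, 2);  // positions 0,1,2,3 -> 0,0,1,1
    llama_memory_seq_add(mem, 0, 4, 8, -2); // positions 4..7 slide down to 2..5
    // n_past -= bd  ->  6
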
@@ -1650,6 +1666,8 @@ class chat
         // const llama_token id = common_sampler_sample(smpl, ctx, -1);
         llama_token id = common_sampler_sample(smpl, ctx, -1);
 
+        get_last_candidates_logits_display();
+
         // try to sample a different token to avoid empty messages
         int attempts = 1000; // safeguard
         while (emptyMessage == true && llama_token_is_eog(vocab, id) && attempts > 0) {
@@ -1738,7 +1756,7 @@ class chat
         capture_smpl();
         // rewind_state.capture_kv_cache(llama_kv_cache_seq_pos_max(ctx, 0));
         // rewind_state.capture_kv_cache(llama_kv_self_seq_pos_max(ctx, -1));
-        rewind_state.capture_kv_cache(llama_kv_self_seq_pos_max(ctx, 0));
+        rewind_state.capture_kv_cache(llama_memory_seq_pos_max(mem, 0));
         rewind_state.capture_embd_inp(embd_inp.size());
         rewind_state.capture_n_past(n_past);
         rewind_state.capture_n_consumed(n_consumed);
@@ -1748,7 +1766,7 @@ class chat
     int get_kv_cache_seq_pos_max() {
         // return llama_kv_cache_seq_pos_max(ctx, 0);
         // return llama_kv_self_seq_pos_max(ctx, -1);
-        return llama_kv_self_seq_pos_max(ctx, 0);
+        return llama_memory_seq_pos_max(mem, 0);
     }
 
     void clearStates2() {
@@ -1764,7 +1782,7 @@ class chat
         restore_smpl();
         // common_sampler_reset(smpl);
         // context
-        llama_kv_self_seq_rm(ctx, 0, rewind_state.kv_cache_pos, -1);
+        llama_memory_seq_rm(mem, 0, rewind_state.kv_cache_pos, -1);
         // llama_kv_self_seq_rm(ctx, -1, rewind_state.kv_cache_pos, -1);
         // llama_kv_cache_update(ctx);
         // chat parameters
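
Taken together, the last three hunks keep the rewind mechanism symmetric on the new API: the position recorded through llama_memory_seq_pos_max() is exactly the cut point later handed back to llama_memory_seq_rm():

    // capture: remember how far sequence 0 has been decoded
    rewind_state.capture_kv_cache(llama_memory_seq_pos_max(mem, 0));
    // ... generation ...
    // restore: drop everything decoded past the captured position
    llama_memory_seq_rm(mem, 0, rewind_state.kv_cache_pos, -1);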