@@ -2267,50 +2267,49 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
-                llama_token id;
+                llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
-                {
-                    completion_token_output result;
-
-                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                slot.i_batch = -1;
 
-                    slot.i_batch = -1;
+                common_sampler_accept(slot.smpl, id, true);
 
-                    common_sampler_accept(slot.smpl, id, true);
-
-                    slot.n_decoded += 1;
-                    if (slot.n_decoded == 1) {
-                        slot.t_start_generation = ggml_time_us();
-                        slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                        metrics.on_prompt_eval(slot);
-                    }
+                slot.n_decoded += 1;
+                if (slot.n_decoded == 1) {
+                    slot.t_start_generation = ggml_time_us();
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slot);
+                }
 
-                    result.tok = id;
+                completion_token_output result;
+                result.tok = id;
 
-                    const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
 
-                    for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
-                        result.probs.push_back({
-                            cur_p->data[i].id,
-                                i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                        });
-                    }
+                for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
+                    result.probs.push_back({
+                        cur_p->data[i].id,
+                            i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                    });
+                }
 
-                    if (!process_token(result, slot)) {
-                        // release slot because of stop condition
-                        slot.release();
-                        slot.print_timings();
-                        send_final_response(slot);
-                        metrics.on_prediction(slot);
-                        continue;
-                    }
+                if (!process_token(result, slot)) {
+                    // release slot because of stop condition
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    metrics.on_prediction(slot);
+                    continue;
                 }
+            }
 
-                // check if the slot supports speculative decoding
-                if (!slot.can_speculate()) {
+            // do speculative decoding
+            for (auto & slot : slots) {
+                if (!slot.is_processing() || !slot.can_speculate()) {
                     continue;
                 }
 
+                llama_token id = slot.sampled;
+
                 struct common_speculative_params params_spec;
                 params_spec.n_draft   = slot.params.speculative.n_max;
                 params_spec.n_reuse   = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
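
For orientation (not part of the patch): this hunk flattens the old anonymous block around per-slot sampling and moves speculative decoding out of that path into a separate pass over the slots, seeded by the last sampled token (slot.sampled). Below is a minimal, self-contained sketch of that two-pass control flow; Slot, sample_token and the field names are simplified stand-ins for illustration only, not the actual server_context types.

// Sketch of the two-pass structure introduced by this change.
// Hypothetical simplified types; the real code lives in server_context::update_slots().
#include <cstdio>
#include <vector>

struct Slot {
    int  i_batch     = -1;    // index of this slot's logits in the current batch, -1 if none
    bool processing  = true;  // slot currently has an active request
    bool speculative = false; // a draft model is attached to this slot
    int  sampled     = 0;     // last sampled token id

    bool is_processing() const { return processing; }
    bool can_speculate() const { return speculative; }
};

int sample_token(const Slot &) { return 42; } // stand-in for common_sampler_sample()

int main() {
    std::vector<Slot> slots(2);
    slots[0].i_batch = 0;
    slots[1].i_batch = 1;
    slots[1].speculative = true;

    // pass 1: regular sampling for every slot that received logits in this batch
    for (auto & slot : slots) {
        if (slot.i_batch < 0) {
            continue;
        }
        slot.sampled = sample_token(slot);
        slot.i_batch = -1;
        // ... accept the token, update timings, build the result, process_token() ...
    }

    // pass 2: speculative decoding runs in its own loop, only for eligible slots
    for (auto & slot : slots) {
        if (!slot.is_processing() || !slot.can_speculate()) {
            continue;
        }
        const int id = slot.sampled; // last token from pass 1 seeds the draft
        std::printf("slot would draft ahead of token %d\n", id);
        // ... fill common_speculative_params and generate/verify the draft ...
    }
    return 0;
}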