@@ -1738,7 +1738,7 @@ struct server_queue {
17381738
17391739 while (true ) {
17401740 QUE_DBG (" %s" , " processing new tasks\n " );
1741- const int64_t t_turn_start_us = ggml_time_us ();
1741+
17421742 while (true ) {
17431743 std::unique_lock<std::mutex> lock (mutex_tasks);
17441744 if (!running) {
@@ -1761,11 +1761,7 @@ struct server_queue {
17611761 QUE_DBG (" %s" , " update slots\n " );
17621762
17631763 callback_update_slots ();
1764- const int64_t t_turn_end_us = ggml_time_us ();
1765- SRV_DBG (
1766- " [PERF] Server turn time: %.2f ms\n " ,
1767- (t_turn_end_us - t_turn_start_us) / 1000.0
1768- );
1764+
17691765 QUE_DBG (" %s" , " waiting for new tasks\n " );
17701766 {
17711767 std::unique_lock<std::mutex> lock (mutex_tasks);
@@ -3471,7 +3467,6 @@ struct server_context {
34713467 batch.seq_id + i,
34723468 batch.logits + i,
34733469 };
3474- LOG_INF (" \n [DEBUG-CHUNK] Processing main model chunk. Batch size: %d\n " , n_tokens);
34753470
34763471 const int ret = llama_decode (ctx, batch_view);
34773472
@@ -3569,10 +3564,8 @@ struct server_context {
35693564 }
35703565 llama_token id = common_sampler_sample (slot.smpl , ctx, tok_idx);
35713566 slot.last_tok_idx = tok_idx;
3572- // SRV_INF("main loop sampled token: '%s'\n", common_token_to_piece(ctx, id, true).c_str());
35733567
35743568 slot.i_batch = -1 ;
3575- SLT_INF (slot, " [SAMPLER-ACCEPT] Accepting token ID %d at index %zu\n " , id, i);
35763569 common_sampler_accept (slot.smpl , id, true );
35773570
35783571 slot.n_decoded += 1 ;
@@ -3644,7 +3637,6 @@ struct server_context {
36443637
36453638 llama_tokens draft;
36463639 if (slot.has_mtp ) {
3647- SLT_INF (slot, " [POS-SYNC] Before draft gen. n_past = %d\n " , slot.n_past );
36483640 llama_token draft_id = mtp_speculative_gen_draft (slot.smpl , ctx, id, slot.n_past , slot.last_tok_idx );
36493641 draft.reserve (1 );
36503642 draft.push_back (draft_id);
@@ -3680,41 +3672,24 @@ struct server_context {
36803672 }
36813673
36823674 SLT_DBG (slot, " decoding speculative batch, size = %d\n " , slot.batch_spec .n_tokens );
3683- SLT_INF (slot, " [POS-SYNC] Before validation decode. n_past = %d, spec_batch_size = %d\n " , slot.n_past , slot.batch_spec .n_tokens );
36843675 llama_decode (ctx, slot.batch_spec );
36853676
3686- const size_t n_embd = llama_n_embd (llama_get_model (ctx));
3687- const size_t golden_buffer_size_in_floats = slot.batch_spec .n_tokens * n_embd;
3688- const float * golden_embd_ptr = llama_get_embeddings (ctx);
3689- double golden_checksum = calculate_vector_sum_double (golden_embd_ptr, golden_buffer_size_in_floats);
3690- SLT_INF (slot, " [VERIFY] Golden checksum after validation: %e (size: %zu tokens)\n " , golden_checksum, slot.batch_spec .n_tokens );
3691-
36923677 // the accepted tokens from the speculation
36933678 const auto ids = common_sampler_sample_and_accept_n (slot.smpl , ctx, draft);
3694- SLT_INF (slot, " [POS-SYNC] Tokens accepted: %zu\n " , ids.size ());
36953679
36963680 if (slot.has_mtp ) {
36973681 llama_set_draft_input_hidden_state (ctx, llama_get_embeddings_ith (ctx, ids.size () - 1 ));
36983682
3699- const float * embd_after_draft_ptr = llama_get_embeddings (ctx);
3700- double checksum_after_draft = calculate_vector_sum_double (embd_after_draft_ptr, golden_buffer_size_in_floats);
3701- SLT_INF (slot, " [VERIFY] Checksum after draft gen (should be unchanged): %e\n " , checksum_after_draft);
3702-
37033683 if (!ids.empty ()) {
37043684 llama_set_draft_input_hidden_state (ctx, llama_get_embeddings_ith (ctx, ids.size () - 1 ));
37053685 } else {
37063686 llama_set_draft_input_hidden_state (ctx, llama_get_embeddings_ith (ctx, 0 ));
37073687 }
37083688
37093689 mtp_accept_tokens (ctx, ids, slot.n_past , slot.id );
3710-
3711- const float * embd_after_update_ptr = llama_get_embeddings (ctx);
3712- double checksum_after_update = calculate_vector_sum_double (embd_after_update_ptr, golden_buffer_size_in_floats);
3713- SLT_INF (slot, " [VERIFY] Checksum after MTP update (should be unchanged): %e\n " , checksum_after_update);
37143690 }
37153691
37163692 slot.n_past += ids.size ();
3717- SLT_INF (slot, " [POS-SYNC] After n_past update. New n_past = %d\n " , slot.n_past );
37183693 slot.n_decoded += ids.size ();
37193694
37203695 // update how many tokens out of those tested were accepted
0 commit comments