@@ -45,6 +45,11 @@ bool ends_with(std::string const &fullString, std::string const &ending) {
         return false;
     }
 }
+
+// Scattered through this file are additions from a currently unmerged pull request on llama.cpp that
+// sends progress during the evaluating stage. Pull these out if it's ever merged. -Brad 2025-07-27
+// Reference: https://github.com/ggml-org/llama.cpp/pull/14731/files
+
 // mmojo-server END
 
 using json = nlohmann::ordered_json;
@@ -128,6 +133,9 @@ struct slot_params {
     bool stream        = true;
     bool cache_prompt  = true; // remember the prompt to avoid reprocessing all prompt
     bool return_tokens = false;
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    bool include_prompt_progress = false; // include prompt processing progress in streaming responses
+    // mmojo-server END
 
     int32_t n_keep    = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -278,6 +286,11 @@ struct server_task {
        params.stream        = json_value(data, "stream", false);
        params.cache_prompt  = json_value(data, "cache_prompt", true);
        params.return_tokens = json_value(data, "return_tokens", false);
+
+       // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+       params.include_prompt_progress = json_value(data, "include_prompt_progress", false);
+       // mmojo-server END
+
        params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
        params.n_indent  = json_value(data, "n_indent", defaults.n_indent);
        params.n_keep    = json_value(data, "n_keep", defaults.n_keep);
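
The new `include_prompt_progress` flag is read from the request body just like the existing `stream` and `cache_prompt` options. As a rough sketch of how a client would opt in, the snippet below builds such a request body with nlohmann::json (the same library the server uses); the field names come from the `json_value()` lookups above, while the endpoint, prompt text, and `n_predict` value are placeholders, not part of this diff.

```cpp
// Sketch of a request body that opts into prompt-progress streaming.
// Only "stream" and "include_prompt_progress" are taken from the diff;
// the prompt and n_predict values are illustrative placeholders.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::ordered_json body = {
        {"prompt", "Summarize this document ..."}, // placeholder prompt
        {"n_predict", 128},                        // placeholder generation limit
        {"stream", true},                          // progress is only sent on streaming requests
        {"include_prompt_progress", true},         // new flag parsed above (defaults to false)
    };
    // POSTing this to the completion endpoint is left to the client; just print it here.
    std::cout << body.dump(2) << std::endl;
    return 0;
}
```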
@@ -919,6 +932,14 @@ struct server_task_result_cmpl_partial : server_task_result {
     completion_token_output prob_output;
     result_timings timings;
 
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    // Progress fields (only populated when is_progress_response is true)
+    bool    is_progress_response      = false;
+    int32_t n_past                    = 0;
+    int32_t n_prompt_tokens_processed = 0;
+    float   progress                  = 0.0f;
+    // mmojo-server END
+
     // OAI-compat fields
     bool verbose = false;
     oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
@@ -965,6 +986,19 @@ struct server_task_result_cmpl_partial : server_task_result {
         if (!prob_output.probs.empty()) {
             res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
         }
+
+        // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+        // include prompt processing progress if this is a progress response
+        if (is_progress_response) {
+            res["prompt_processing"] = json {
+                {"n_past",                    n_past},
+                {"n_prompt_tokens",           n_prompt_tokens},
+                {"n_prompt_tokens_processed", n_prompt_tokens_processed},
+                {"progress",                  progress},
+            };
+        }
+        // mmojo-server END
+
         return res;
     }
 
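
For context, here is a hedged sketch of what the `prompt_processing` object built above serializes to. Only the field names are taken from the diff; the numbers are made up (a 200-token prompt with 128 tokens processed so far), and the enclosing streamed chunk also carries the usual completion fields, which stay empty on progress chunks.

```cpp
// Sketch of the "prompt_processing" payload with example numbers only.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::ordered_json prompt_processing = {
        {"n_past",                    128},
        {"n_prompt_tokens",           200},
        {"n_prompt_tokens_processed", 128},
        {"progress",                  128.0f / 200.0f}, // 0.64
    };
    std::cout << prompt_processing.dump(2) << std::endl;
    return 0;
}
```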
@@ -2538,6 +2572,71 @@ struct server_context {
         queue_results.send(std::move(res));
     }
 
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    void send_progress_response(server_slot & slot) {
+        // Only send progress if explicitly requested and streaming is enabled
+        if (!slot.params.include_prompt_progress || !slot.params.stream) {
+            return;
+        }
+
+        // Calculate current progress percentage
+        float current_progress = slot.n_prompt_tokens > 0 ?
+            (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens : 0.0f;
+
+        // Send progress updates at regular intervals (every 10% or significant changes)
+        static float last_progress = -1.0f;
+        static int   last_slot_id  = -1;
+
+        // Reset for new slot
+        if (slot.id_task != last_slot_id) {
+            last_progress = -1.0f;
+            last_slot_id  = slot.id_task;
+        }
+
+        /*
+        // This logic is WRONG. Just send_progress_response() after each completed batch. Set the batch size
+        // to like 64 on R-Pi with a 4B model. Voila. -Brad 2025-07-27
+
+        // Send progress if:
+        // 1. This is the first progress update (last_progress == -1)
+        // 2. Progress increased by at least 1% or processed at least 10 tokens
+        // 3. We've completed processing (current_progress >= 1.0)
+        bool should_send = (last_progress < 0.0f) ||
+                           (current_progress - last_progress >= 0.01f) ||
+                           (current_progress >= 1.0f && last_progress < 1.0f);
+
+        if (!should_send) {
+            return;
+        }
+        */
+
+        last_progress = current_progress;
+
+        auto res = std::make_unique<server_task_result_cmpl_partial>();
+
+        res->id      = slot.id_task;
+        res->index   = slot.index;
+        res->content = ""; // empty content for progress responses
+        res->tokens  = {}; // empty tokens for progress responses
+
+        res->n_decoded       = 0; // no tokens decoded yet during prompt processing
+        res->n_prompt_tokens = slot.n_prompt_tokens;
+
+        // Progress-specific fields
+        res->is_progress_response      = true;
+        res->n_past                    = slot.n_past;
+        res->n_prompt_tokens_processed = slot.n_prompt_tokens_processed;
+        res->progress                  = current_progress;
+
+        res->verbose           = slot.params.verbose;
+        res->oaicompat         = slot.params.oaicompat;
+        res->oaicompat_model   = slot.params.oaicompat_model;
+        res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
+
+        queue_results.send(std::move(res));
+    }
+    // mmojo-server END
+
     void send_final_response(server_slot & slot) {
         auto res = std::make_unique<server_task_result_cmpl_final>();
         res->id = slot.id_task;
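
The commented-out note above argues for calling `send_progress_response()` once per completed batch rather than throttling on percentage deltas. Purely as an illustration of the arithmetic involved (the fraction `n_prompt_tokens_processed / n_prompt_tokens`), the standalone sketch below prints the progress values a client would see for an assumed 200-token prompt and a batch size of 64, the kind of small batch the note suggests for a Raspberry Pi; both numbers are examples, not values from the diff.

```cpp
// Illustration of per-batch progress values; batch size and prompt length are assumptions.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_prompt_tokens = 200; // assumed prompt length
    const int n_batch         = 64;  // assumed batch size (small batches per the in-code note)

    int n_processed = 0;
    while (n_processed < n_prompt_tokens) {
        n_processed = std::min(n_processed + n_batch, n_prompt_tokens);
        float progress = (float) n_processed / n_prompt_tokens;
        // With these numbers the stream would carry: 0.32, 0.64, 0.96, 1.00
        std::printf("processed %3d/%d tokens, progress = %.2f\n",
                    n_processed, n_prompt_tokens, progress);
    }
    return 0;
}
```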
@@ -3367,12 +3466,24 @@ struct server_context {
 
                         slot.n_prompt_tokens_processed++;
                         slot.n_past++;
+
+                        // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+                        // THIS IS WRONG. The notifications all batch up at the end. Grrrrrr. -Brad 2025-07-27.
+
+                        // Send incremental progress updates during token processing
+                        // send_progress_response(slot);
+                        // mmojo-server END
                     }
 
                     // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
 
                     SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
+                    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+                    // Send progress response if requested
+                    send_progress_response(slot);
+                    // mmojo-server END
+
                     // entire prompt has been processed
                     if (slot.n_past == slot.n_prompt_tokens) {
                         slot.state = SLOT_STATE_DONE_PROMPT;
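
With the call moved to this per-batch location, each batch of the prompt produces one streamed chunk carrying `prompt_processing`. The sketch below shows how a client might tell such a chunk apart from a normal token chunk; the JSON payload is hand-written to mirror the fields added in this diff, the exact envelope depends on the endpoint and OAI-compat mode, and SSE framing (`data: ...` lines) is omitted.

```cpp
// Sketch of client-side handling for one streamed chunk (payload is an example).
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    const char * chunk = R"({
        "content": "",
        "prompt_processing": {
            "n_past": 128,
            "n_prompt_tokens": 200,
            "n_prompt_tokens_processed": 128,
            "progress": 0.64
        }
    })";

    auto j = nlohmann::json::parse(chunk);
    if (j.contains("prompt_processing")) {
        // progress chunk: update a progress indicator instead of appending text
        const auto & pp = j["prompt_processing"];
        std::cout << "prompt " << (pp["progress"].get<float>() * 100.0f) << "% processed\n";
    } else {
        // normal chunk: append generated text
        std::cout << j.value("content", "");
    }
    return 0;
}
```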