
Commit 8c59eb3

Merge pull request #102 from BradHutchings/work-in-progress

Progress API and UI for evaluating long inputs.

2 parents 3d4afbe + 7ee8e9f, commit 8c59eb3

4 files changed: +148, -7 lines changed
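
Before the per-file diffs, a sketch of the wire format this commit introduces. When a streaming completion request sets the new include_prompt_progress flag, the server emits extra partial results whose prompt_processing object carries the progress fields added in tools/server/server-mmojo.cpp below. The field names come from the diff; the request shape and the numeric values here are illustrative only.

```js
// Request body: the UI now sends this flag (see scripts.js below).
// The prompt / n_predict / temperature values are illustrative, not from the commit.
const requestBody = {
  prompt: "…long input…",
  n_predict: 128,
  temperature: 0.7,
  stream: true,
  include_prompt_progress: true,
};

// Illustrative shape of one streamed "data: {...}" progress event.
// Keys match server_task_result_cmpl_partial::to_json() in server-mmojo.cpp.
const progressEvent = {
  content: "",                      // empty content for progress responses
  prompt_processing: {
    n_past: 128,                    // tokens evaluated so far
    n_prompt_tokens: 512,           // total prompt tokens
    n_prompt_tokens_processed: 128, // prompt tokens actually processed for this request
    progress: 0.25,                 // n_prompt_tokens_processed / n_prompt_tokens
  },
};
```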

completion-ui/completion/scripts.js

Lines changed: 31 additions & 7 deletions

@@ -21,6 +21,8 @@ const kMmojoCompletion = "Mmojo Completion";
 const kStatus_TypeSomething = "Awaiting your cue.";
 const kStatus_Ready = "Ready.";
 const kStatus_Evaluating = "Evaluating.";
+const kStatus_EvaluatingProgress = "Evaluating ";
+const kStatus_EvaulatingFinishing = "Finishing evaluating.";
 const kStatus_Generating = "Generating.";
 const kStatus_FinishedGenerating = "Finished generating.";
 const kStatus_StoppedByWord = "Stopped by \"[stopping_word]\".";

@@ -456,6 +458,8 @@ var manualStop = false;
 var generatedContent = '';
 
 async function StartGenerating(workAreaText, temperature, tokens, stopWords) {
+  let logThis = true;
+
   // show that we're working??
   SetStatus(kStatus_Evaluating);
 

@@ -466,6 +470,7 @@ async function StartGenerating(workAreaText, temperature, tokens, stopWords) {
     "n_predict": tokens,
     "temperature": temperature,
     "stream": true,
+    "include_prompt_progress": true,
   }
 
   if (stopWords.length > 0) {

@@ -530,15 +535,34 @@ async function StartGenerating(workAreaText, temperature, tokens, stopWords) {
 
       if (match) {
         lineData[match[1]] = match[2]; // data: { whatever }
-        if (kLogging) console.log(match[1] + ": " + match[2]);
+        if (kLogging || logThis) console.log(match[1] + ": " + match[2]);
       }
 
       if (lineData.data) {
         lineData.data = JSON.parse(lineData.data);
 
-        if (kLogging) console.log(lineData.data);
+        if (kLogging || logThis) console.log(lineData.data);
+
+        if ("prompt_processing" in lineData.data) {
+          if (kLogging || logThis) console.log("Prompt processing:");
+          if (kLogging || logThis) console.log(lineData.data.prompt_processing);
+
+          let n_past = lineData.data.prompt_processing.n_past;
+          let n_prompt_tokens = lineData.data.prompt_processing.n_prompt_tokens;
+
+          if (kLogging || logThis) console.log("n_past: " + n_past);
+          if (kLogging || logThis) console.log("n_prompt_tokens: " + n_prompt_tokens);
+
+          if (n_past < n_prompt_tokens) {
+            let status = kStatus_EvaluatingProgress + " " + n_past + " / " + n_prompt_tokens;
+            SetStatus(status);
+          }
+          else {
+            SetStatus(kStatus_EvaulatingFinishing);
+          }
+        }
 
-        if ((lineData.data.stop_type == "word") && lineData.data.stopping_word !== "") {
+        else if ((lineData.data.stop_type == "word") && lineData.data.stopping_word !== "") {
           // if (kLogging) console.log("stopping_word: " + lineData.data.stopping_word);
           SetStatus(kStatus_StoppedByWord.replace('[stopping_word]', lineData.data.stopping_word));
 

@@ -563,7 +587,7 @@ async function StartGenerating(workAreaText, temperature, tokens, stopWords) {
     // elements.workAreaText.scrollTop = elements.workAreaText.scrollHeight
     // don't set selectionStart, selectionEnd?
 
-    if (kLogging) console.log("end of strem");
+    if (kLogging || logThis) console.log("end of strem");
     controller = null;
     ShowHideStatusButtons();
   }

@@ -598,9 +622,9 @@ async function StartGenerating(workAreaText, temperature, tokens, stopWords) {
     }
   }
   catch(exc) {
-    if (kLogging) console.log("Exception caught receiving results.");
-    if (kLogging) console.log(exc.name);
-    if (kLogging) console.log(exc.message);
+    if (kLogging || logThis) console.log("Exception caught receiving results.");
+    if (kLogging || logThis) console.log(exc.name);
+    if (kLogging || logThis) console.log(exc.message);
 
     // I thought this might be a checkbox in settings, but that felt clumsy.
     // These are mostly network errors. It would be good for the user to know.
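
Outside the Mmojo completion UI, a stand-alone client can consume these events the same way StartGenerating() now does. The sketch below is not part of the commit: it assumes the server's streaming /completion endpoint and its data: {...} line framing (which scripts.js already parses), and the serverUrl and onProgress names are invented for illustration. For brevity it does not buffer a JSON payload split across read() chunks.

```js
// Minimal sketch: stream a completion and report prompt-evaluation progress.
async function completeWithProgress(serverUrl, prompt, onProgress) {
  const response = await fetch(serverUrl + "/completion", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      prompt: prompt,
      n_predict: 128,
      stream: true,
      include_prompt_progress: true, // flag added by this commit
    }),
  });

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let text = "";

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    for (const line of decoder.decode(value, { stream: true }).split("\n")) {
      if (!line.startsWith("data: ")) continue;
      const data = JSON.parse(line.slice("data: ".length));

      if (data.prompt_processing) {
        // Same fields the UI reads: n_past and n_prompt_tokens.
        onProgress(data.prompt_processing.n_past, data.prompt_processing.n_prompt_tokens);
      } else if (data.content) {
        text += data.content;
      }
    }
  }
  return text;
}
```

Called as, say, completeWithProgress("http://localhost:8080", longPrompt, (n, total) => console.log(n + " / " + total)) against a local server, it reports the same n_past / n_prompt_tokens counts the UI now shows in its status line.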

docs/Build-mmojo-server-merge.md

Lines changed: 4 additions & 0 deletions

@@ -93,6 +93,7 @@ unset UNAME_M; export UNAME_M
 make clean
 make
 mkdir -p Builds-Platform
+printf "Copying builds to Builds-Platform.\n"
 cp mmojo-* llama-* Builds-Platform
 
 printf "\n**********\n*\n* FINISHED: Build llama.cpp.\n*\n**********\n\n"

@@ -169,6 +170,7 @@ printf "\n**********\n*\n* FINISHED: Build openssl with Cosmo.\n*\n**********\n\
 make clean
 make mmojo-server
 mkdir -p Builds-Cosmo
+printf "Copying builds to Builds-Cosmo.\n"
 cp mmojo-* Builds-Cosmo
 printf "\n**********\n*\n* FINISHED: Build mmojo-server with Cosmo\n*\n**********\n\n"
 ```

@@ -221,6 +223,7 @@ printf "\n**********\n*\n* FINISHED: Prepare to Build llama.cpp with Cosmo.\n*\n
 make clean
 make mmojo-server
 mkdir -p Builds-Cosmo-x86_64
+printf "Copying builds to Builds-Cosmo-x86_64.\n"
 cp mmojo-* Builds-Cosmo-x86_64
 printf "\n**********\n*\n* FINISHED: Build mmojo-server with Cosmo\n*\n**********\n\n"
 ```

@@ -271,6 +274,7 @@ printf "\n**********\n*\n* FINISHED: Prepare to Build llama.cpp with Cosmo.\n*\n
 make clean
 make mmojo-server
 mkdir -p Builds-Cosmo-Aarch64
+printf "Copying builds to Builds-Cosmo-Aarch64.\n"
 cp mmojo-* Builds-Cosmo-Aarch64
 printf "\n**********\n*\n* FINISHED: Build mmojo-server with Cosmo\n*\n**********\n\n"
 ```

docs/Configure-mmojo-server-merge.md

Lines changed: 2 additions & 0 deletions

@@ -145,6 +145,8 @@ model.gguf
 0
 --threads-http
 8
+--batch-size
+64
 --path
 /zip/website
 --ssl-key-file
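
The new --batch-size 64 entry works together with the server change below: a progress response is sent once per prompt-processing batch, so a smaller batch yields finer-grained status updates (a 2,048-token prompt evaluated in 64-token batches produces roughly 32 of them), per the in-code note about using a batch size of about 64 on a Raspberry Pi with a 4B model.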

tools/server/server-mmojo.cpp

Lines changed: 111 additions & 0 deletions

@@ -45,6 +45,11 @@ bool ends_with (std::string const &fullString, std::string const &ending) {
         return false;
     }
 }
+
+// Scattered through this file are additions from a currently unmerged pull request on llama.cpp that
+// sends progress during the evaluating stage. Pull these out if it's ever merged. -Brad 2025-07-27
+// Reference: https://github.com/ggml-org/llama.cpp/pull/14731/files
+
 // mmojo-server END
 
 using json = nlohmann::ordered_json;

@@ -128,6 +133,9 @@ struct slot_params {
     bool stream = true;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
     bool return_tokens = false;
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    bool include_prompt_progress = false; // include prompt processing progress in streaming responses
+    // mmojo-server END
 
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half

@@ -278,6 +286,11 @@ struct server_task {
         params.stream = json_value(data, "stream", false);
         params.cache_prompt = json_value(data, "cache_prompt", true);
         params.return_tokens = json_value(data, "return_tokens", false);
+
+        // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+        params.include_prompt_progress = json_value(data, "include_prompt_progress", false);
+        // mmojo-server END
+
         params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
         params.n_indent = json_value(data, "n_indent", defaults.n_indent);
         params.n_keep = json_value(data, "n_keep", defaults.n_keep);

@@ -919,6 +932,14 @@ struct server_task_result_cmpl_partial : server_task_result {
     completion_token_output prob_output;
     result_timings timings;
 
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    // Progress fields (only populated when is_progress_response is true)
+    bool is_progress_response = false;
+    int32_t n_past = 0;
+    int32_t n_prompt_tokens_processed = 0;
+    float progress = 0.0f;
+    // mmojo-server END
+
     // OAI-compat fields
     bool verbose = false;
     oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;

@@ -965,6 +986,19 @@
         if (!prob_output.probs.empty()) {
             res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
         }
+
+        // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+        // include prompt processing progress if this is a progress response
+        if (is_progress_response) {
+            res["prompt_processing"] = json {
+                {"n_past", n_past},
+                {"n_prompt_tokens", n_prompt_tokens},
+                {"n_prompt_tokens_processed", n_prompt_tokens_processed},
+                {"progress", progress},
+            };
+        }
+        // mmojo-server END
+
         return res;
     }
 

@@ -2538,6 +2572,71 @@ struct server_context {
         queue_results.send(std::move(res));
     }
 
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    void send_progress_response(server_slot & slot) {
+        // Only send progress if explicitly requested and streaming is enabled
+        if (!slot.params.include_prompt_progress || !slot.params.stream) {
+            return;
+        }
+
+        // Calculate current progress percentage
+        float current_progress = slot.n_prompt_tokens > 0 ?
+            (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens : 0.0f;
+
+        // Send progress updates at regular intervals (every 10% or significant changes)
+        static float last_progress = -1.0f;
+        static int last_slot_id = -1;
+
+        // Reset for new slot
+        if (slot.id_task != last_slot_id) {
+            last_progress = -1.0f;
+            last_slot_id = slot.id_task;
+        }
+
+        /*
+        // This logic is WRONG. Just send_progress_response() after each completed batch. Set the batch size
+        // to like 64 on R-Pi with a 4B model. Voila. -Brad 2025-07-27
+
+        // Send progress if:
+        // 1. This is the first progress update (last_progress == -1)
+        // 2. Progress increased by at least 1% or processed at least 10 tokens
+        // 3. We've completed processing (current_progress >= 1.0)
+        bool should_send = (last_progress < 0.0f) ||
+                           (current_progress - last_progress >= 0.01f) ||
+                           (current_progress >= 1.0f && last_progress < 1.0f);
+
+        if (!should_send) {
+            return;
+        }
+        */
+
+        last_progress = current_progress;
+
+        auto res = std::make_unique<server_task_result_cmpl_partial>();
+
+        res->id = slot.id_task;
+        res->index = slot.index;
+        res->content = ""; // empty content for progress responses
+        res->tokens = {};  // empty tokens for progress responses
+
+        res->n_decoded = 0; // no tokens decoded yet during prompt processing
+        res->n_prompt_tokens = slot.n_prompt_tokens;
+
+        // Progress-specific fields
+        res->is_progress_response = true;
+        res->n_past = slot.n_past;
+        res->n_prompt_tokens_processed = slot.n_prompt_tokens_processed;
+        res->progress = current_progress;
+
+        res->verbose = slot.params.verbose;
+        res->oaicompat = slot.params.oaicompat;
+        res->oaicompat_model = slot.params.oaicompat_model;
+        res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
+
+        queue_results.send(std::move(res));
+    }
+    // mmojo-server END
+
     void send_final_response(server_slot & slot) {
         auto res = std::make_unique<server_task_result_cmpl_final>();
         res->id = slot.id_task;

@@ -3367,12 +3466,24 @@ struct server_context {
 
                 slot.n_prompt_tokens_processed++;
                 slot.n_past++;
+
+                // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+                // THIS IS WRONG. The notifications all batch up at the end. Grrrrrr. -Brad 2025-07-27.
+
+                // Send incremental progress updates during token processing
+                // send_progress_response(slot);
+                // mmojo-server END
             }
 
             // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
 
             SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
+            // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+            // Send progress response if requested
+            send_progress_response(slot);
+            // mmojo-server END
+
             // entire prompt has been processed
             if (slot.n_past == slot.n_prompt_tokens) {
                 slot.state = SLOT_STATE_DONE_PROMPT;
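
Design note on the section above: the per-token call to send_progress_response() inside the prompt loop is commented out because, as the in-line remark says, those notifications all arrived in a burst at the end; instead one progress response is sent per prompt-processing batch, which is why the companion configuration change sets --batch-size 64. With the interval check commented out, the static last_progress / last_slot_id throttling state is still updated but no longer gates anything.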
