@@ -45,6 +45,11 @@ bool ends_with(std::string const &fullString, std::string const &ending) {
         return false;
     }
 }
+
+// Scattered through this file are additions from a currently unmerged pull request on llama.cpp that
+// sends progress during the evaluating stage. Pull these out if it's ever merged. -Brad 2025-07-27
+// Reference: https://github.com/ggml-org/llama.cpp/pull/14731/files
+
 // mmojo-server END
 
 using json = nlohmann::ordered_json;
@@ -128,6 +133,9 @@ struct slot_params {
     bool stream        = true;
     bool cache_prompt  = true; // remember the prompt to avoid reprocessing all prompt
     bool return_tokens = false;
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    bool include_prompt_progress = false; // include prompt processing progress in streaming responses
+    // mmojo-server END
 
     int32_t n_keep    = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -278,6 +286,11 @@ struct server_task {
        params.stream        = json_value(data, "stream", false);
        params.cache_prompt  = json_value(data, "cache_prompt", true);
        params.return_tokens = json_value(data, "return_tokens", false);
+
+       // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+       params.include_prompt_progress = json_value(data, "include_prompt_progress", false);
+       // mmojo-server END
+
        params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
        params.n_indent  = json_value(data, "n_indent", defaults.n_indent);
        params.n_keep    = json_value(data, "n_keep", defaults.n_keep);
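
The new `include_prompt_progress` flag is read from the request body just like the existing `stream` and `cache_prompt` options. As a rough sketch of how a client would opt in, the snippet below builds such a request body with nlohmann::json (the same library the server uses); the field names come from the `json_value()` lookups above, while the endpoint, prompt text, and `n_predict` value are placeholders, not part of this diff.

```cpp
// Sketch of a request body that opts into prompt-progress streaming.
// Only "stream" and "include_prompt_progress" are taken from the diff;
// the prompt and n_predict values are illustrative placeholders.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::ordered_json body = {
        {"prompt", "Summarize this document ..."}, // placeholder prompt
        {"n_predict", 128},                        // placeholder generation limit
        {"stream", true},                          // progress is only sent on streaming requests
        {"include_prompt_progress", true},         // new flag parsed above (defaults to false)
    };
    // POSTing this to the completion endpoint is left to the client; just print it here.
    std::cout << body.dump(2) << std::endl;
    return 0;
}
```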
@@ -919,6 +932,14 @@ struct server_task_result_cmpl_partial : server_task_result {
     completion_token_output prob_output;
     result_timings timings;
 
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    // Progress fields (only populated when is_progress_response is true)
+    bool    is_progress_response      = false;
+    int32_t n_past                    = 0;
+    int32_t n_prompt_tokens_processed = 0;
+    float   progress                  = 0.0f;
+    // mmojo-server END
+
     // OAI-compat fields
     bool verbose = false;
     oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
@@ -965,6 +986,19 @@ struct server_task_result_cmpl_partial : server_task_result {
         if (!prob_output.probs.empty()) {
             res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
         }
+
+        // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+        // include prompt processing progress if this is a progress response
+        if (is_progress_response) {
+            res["prompt_processing"] = json {
+                {"n_past",                    n_past},
+                {"n_prompt_tokens",           n_prompt_tokens},
+                {"n_prompt_tokens_processed", n_prompt_tokens_processed},
+                {"progress",                  progress},
+            };
+        }
+        // mmojo-server END
+
         return res;
     }
 
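
For context, here is a hedged sketch of what the `prompt_processing` object built above serializes to. Only the field names are taken from the diff; the numbers are made up (a 200-token prompt with 128 tokens processed so far), and the enclosing streamed chunk also carries the usual completion fields, which stay empty on progress chunks.

```cpp
// Sketch of the "prompt_processing" payload with example numbers only.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::ordered_json prompt_processing = {
        {"n_past",                    128},
        {"n_prompt_tokens",           200},
        {"n_prompt_tokens_processed", 128},
        {"progress",                  128.0f / 200.0f}, // 0.64
    };
    std::cout << prompt_processing.dump(2) << std::endl;
    return 0;
}
```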
@@ -2538,6 +2572,71 @@ struct server_context {
         queue_results.send(std::move(res));
     }
 
+    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+    void send_progress_response(server_slot & slot) {
+        // Only send progress if explicitly requested and streaming is enabled
+        if (!slot.params.include_prompt_progress || !slot.params.stream) {
+            return;
+        }
+
+        // Calculate current progress percentage
+        float current_progress = slot.n_prompt_tokens > 0 ?
+            (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens : 0.0f;
+
+        // Send progress updates at regular intervals (every 10% or significant changes)
+        static float last_progress = -1.0f;
+        static int   last_slot_id  = -1;
+
+        // Reset for new slot
+        if (slot.id_task != last_slot_id) {
+            last_progress = -1.0f;
+            last_slot_id  = slot.id_task;
+        }
+
+        /*
+        // This logic is WRONG. Just send_progress_response() after each completed batch. Set the batch size
+        // to like 64 on R-Pi with a 4B model. Voila. -Brad 2025-07-27
+
+        // Send progress if:
+        // 1. This is the first progress update (last_progress == -1)
+        // 2. Progress increased by at least 1% or processed at least 10 tokens
+        // 3. We've completed processing (current_progress >= 1.0)
+        bool should_send = (last_progress < 0.0f) ||
+                           (current_progress - last_progress >= 0.01f) ||
+                           (current_progress >= 1.0f && last_progress < 1.0f);
+
+        if (!should_send) {
+            return;
+        }
+        */
+
+        last_progress = current_progress;
+
+        auto res = std::make_unique<server_task_result_cmpl_partial>();
+
+        res->id      = slot.id_task;
+        res->index   = slot.index;
+        res->content = ""; // empty content for progress responses
+        res->tokens  = {}; // empty tokens for progress responses
+
+        res->n_decoded       = 0; // no tokens decoded yet during prompt processing
+        res->n_prompt_tokens = slot.n_prompt_tokens;
+
+        // Progress-specific fields
+        res->is_progress_response      = true;
+        res->n_past                    = slot.n_past;
+        res->n_prompt_tokens_processed = slot.n_prompt_tokens_processed;
+        res->progress                  = current_progress;
+
+        res->verbose           = slot.params.verbose;
+        res->oaicompat         = slot.params.oaicompat;
+        res->oaicompat_model   = slot.params.oaicompat_model;
+        res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
+
+        queue_results.send(std::move(res));
+    }
+    // mmojo-server END
+
     void send_final_response(server_slot & slot) {
         auto res = std::make_unique<server_task_result_cmpl_final>();
         res->id = slot.id_task;
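
The commented-out note above argues for calling `send_progress_response()` once per completed batch rather than throttling on percentage deltas. Purely as an illustration of the arithmetic involved (the fraction `n_prompt_tokens_processed / n_prompt_tokens`), the standalone sketch below prints the progress values a client would see for an assumed 200-token prompt and a batch size of 64, the kind of small batch the note suggests for a Raspberry Pi; both numbers are examples, not values from the diff.

```cpp
// Illustration of per-batch progress values; batch size and prompt length are assumptions.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_prompt_tokens = 200; // assumed prompt length
    const int n_batch         = 64;  // assumed batch size (small batches per the in-code note)

    int n_processed = 0;
    while (n_processed < n_prompt_tokens) {
        n_processed = std::min(n_processed + n_batch, n_prompt_tokens);
        float progress = (float) n_processed / n_prompt_tokens;
        // With these numbers the stream would carry: 0.32, 0.64, 0.96, 1.00
        std::printf("processed %3d/%d tokens, progress = %.2f\n",
                    n_processed, n_prompt_tokens, progress);
    }
    return 0;
}
```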
@@ -3367,12 +3466,24 @@ struct server_context {
 
                         slot.n_prompt_tokens_processed++;
                         slot.n_past++;
+
+                        // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+                        // THIS IS WRONG. The notifications all batch up at the end. Grrrrrr. -Brad 2025-07-27.
+
+                        // Send incremental progress updates during token processing
+                        // send_progress_response(slot);
+                        // mmojo-server END
                     }
 
                     // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
 
                     SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
 
+                    // mmojo-server START -- https://github.com/ggml-org/llama.cpp/pull/14731/files
+                    // Send progress response if requested
+                    send_progress_response(slot);
+                    // mmojo-server END
+
                     // entire prompt has been processed
                     if (slot.n_past == slot.n_prompt_tokens) {
                         slot.state = SLOT_STATE_DONE_PROMPT;
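
With the call moved to this per-batch location, each batch of the prompt produces one streamed chunk carrying `prompt_processing`. The sketch below shows how a client might tell such a chunk apart from a normal token chunk; the JSON payload is hand-written to mirror the fields added in this diff, the exact envelope depends on the endpoint and OAI-compat mode, and SSE framing (`data: ...` lines) is omitted.

```cpp
// Sketch of client-side handling for one streamed chunk (payload is an example).
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    const char * chunk = R"({
        "content": "",
        "prompt_processing": {
            "n_past": 128,
            "n_prompt_tokens": 200,
            "n_prompt_tokens_processed": 128,
            "progress": 0.64
        }
    })";

    auto j = nlohmann::json::parse(chunk);
    if (j.contains("prompt_processing")) {
        // progress chunk: update a progress indicator instead of appending text
        const auto & pp = j["prompt_processing"];
        std::cout << "prompt " << (pp["progress"].get<float>() * 100.0f) << "% processed\n";
    } else {
        // normal chunk: append generated text
        std::cout << j.value("content", "");
    }
    return 0;
}
```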