修复 diffusion streaming 模式下结束 chunk 信息错误的问题

anyshu · anyshu · commit 893d4214e837 · 2025-10-09T09:40:42.000+08:00
diff --git a/tools/server/server-diffusion.cpp b/tools/server/server-diffusion.cpp
@@ -1304,7 +1304,14 @@ struct server_task_result_cmpl_partial : server_task_result {
             }
         } else if (!content.empty() && !is_progress) {
             // For diffusion or other tasks without diffs, send content directly
-            add_delta({{"content", content}});
+            // Try to parse content as JSON first (for diffusion delta arrays)
+            try {
+                json content_json = json::parse(content);
+                add_delta({{"content", content_json}});
+            } catch (...) {
+                // If parsing fails, treat as plain string
+                add_delta({{"content", content}});
+            }
         }
 
         if (!deltas.empty()) {
@@ -4108,27 +4115,30 @@ struct server_context {
                             // send the complete text as a single chunk before the final response
                             // In streaming mode with callbacks, the text was already sent incrementally
                             if (slot.params.stream) {
-                                // Check if we need to send any remaining text that wasn't sent by callback
-                                if (cb_data.last_sent_text != output_text && !output_text.empty()) {
-                                    std::string remaining_text = output_text.substr(cb_data.last_sent_text.length());
-                                    if (!remaining_text.empty()) {
-                                        completion_token_output result;
-                                        result.tok = -1;
-                                        result.text_to_send = remaining_text;
-                                        result.prob = 1.0f;
-                                        send_partial_response(slot, result, false);
-                                    }
-                                }
+                                // // Check if we need to send any remaining text that wasn't sent by callback
+                                // if (cb_data.last_sent_text != output_text && !output_text.empty()) {
+                                //     std::string remaining_text = output_text.substr(cb_data.last_sent_text.length());
+                                //     if (!remaining_text.empty()) {
+                                //         completion_token_output result;
+                                //         result.tok = -1;
+                                //         result.text_to_send = remaining_text;
+                                //         result.prob = 1.0f;
+                                //         send_partial_response(slot, result, false);
+                                //     }
+                                // }
+                                slot.generated_text = ""; // clear to avoid resending
+                                send_final_response(slot);
                             } else if (!output_text.empty()) {
                                 // Non-streaming: send complete text at once
                                 completion_token_output result;
                                 result.tok = -1;
                                 result.text_to_send = output_text;
                                 result.prob = 1.0f;
                                 send_partial_response(slot, result, false);
+                                send_final_response(slot);
                             }
                             
-                            send_final_response(slot);
+                           
                         } else {
                             send_error(slot, "Diffusion generation failed");
                         }
@@ -4807,45 +4817,51 @@ static bool diffusion_step_callback(int32_t             step,
         bool should_send = (step == 0) || 
                           (step == total_steps - 1);
         
-        // Also send if text has changed significantly (more tokens decoded)
-        if (!should_send && current_text.length() > data->last_sent_text.length() + 10) {
-            should_send = true;
-        }
+        // // Also send if text has changed significantly (more tokens decoded)
+        // if (!should_send && current_text.length() > data->last_sent_text.length() + 10) {
+        //     should_send = true;
+        // }
 
         //for chat/completions
-        if (should_send) {
-            std::string delta_text;
-            // Track token changes for debugging
+        if (true) {
+            json delta_json;
+            bool has_changes = false;
+            
+            // Track token changes and build JSON array
             if (data->last_tokens && step > 0) {
-                int32_t changed_tokens = 0;
-                std::string changes_debug;
+                json content_array = json::array();
+                
                 for (int32_t i = data->n_input; i < n_tokens && i < data->diff_params->max_length; i++) {
                     if (data->last_tokens[i] != tokens[i]) {
-                        changed_tokens++;
-                        if (changes_debug.length() < 200) { // Limit debug output
-                            char old_piece[64], new_piece[64];
-                            int old_n_chars = llama_token_to_piece(data->vocab, data->last_tokens[i], old_piece, sizeof(old_piece), 0, false);
-                            int new_n_chars = llama_token_to_piece(data->vocab, tokens[i], new_piece, sizeof(new_piece), 0, false);
-                            old_piece[old_n_chars] = '\0';
+                        char new_piece[256];
+                        int new_n_chars = llama_token_to_piece(data->vocab, tokens[i], new_piece, sizeof(new_piece), 0, false);
+                        if (new_n_chars > 0) {
                             new_piece[new_n_chars] = '\0';
-                            changes_debug += string_format("[%d: '%s'->'%s'] ", i - data->n_input, old_piece, new_piece);
+                            json change_obj = {
+                                {"position", i - data->n_input},
+                                {"text", std::string(new_piece)}
+                            };
+                            content_array.push_back(change_obj);
                         }
                     }
                 }
-                if (changed_tokens > 0) {
-                    delta_text = string_format("Token changes at step %d: %d tokens changed - %s\n",
-                            step, changed_tokens, changes_debug.c_str());
-                    SRV_INF("%s", delta_text.c_str());
+                
+                if (!content_array.empty()) {
+                    delta_json = content_array;
+                    has_changes = true;
+                    
+                    SRV_INF("Token changes at step %d: %zu positions changed\n", step, content_array.size());
+                    SRV_INF("Delta JSON: %s\n", delta_json.dump().c_str());
                 }
             }
             
-            if (!delta_text.empty()) {
-                SRV_INF("Sending diffusion delta: step=%d/%d, delta_len=%zu, delta=%s\n",
-                        step, total_steps, delta_text.length(), delta_text.c_str());
+            if (has_changes) {
+                SRV_INF("Sending diffusion delta: step=%d/%d, changes=%zu\n",
+                        step, total_steps, delta_json.size());
                 
                 completion_token_output progress_token;
                 progress_token.tok = -1; // Special value for progress
-                progress_token.text_to_send = delta_text;
+                progress_token.text_to_send = delta_json.dump(); // Serialize JSON to string
                 progress_token.prob = 1.0f;
                 
                 // Use is_progress=false to send actual content instead of progress info