@@ -1304,7 +1304,14 @@ struct server_task_result_cmpl_partial : server_task_result {
1304
1304
}
1305
1305
} else if (!content.empty () && !is_progress) {
1306
1306
// For diffusion or other tasks without diffs, send content directly
1307
- add_delta ({{" content" , content}});
1307
+ // Try to parse content as JSON first (for diffusion delta arrays)
1308
+ try {
1309
+ json content_json = json::parse (content);
1310
+ add_delta ({{" content" , content_json}});
1311
+ } catch (...) {
1312
+ // If parsing fails, treat as plain string
1313
+ add_delta ({{" content" , content}});
1314
+ }
1308
1315
}
1309
1316
1310
1317
if (!deltas.empty ()) {
@@ -4108,27 +4115,30 @@ struct server_context {
4108
4115
// send the complete text as a single chunk before the final response
4109
4116
// In streaming mode with callbacks, the text was already sent incrementally
4110
4117
if (slot.params .stream ) {
4111
- // Check if we need to send any remaining text that wasn't sent by callback
4112
- if (cb_data.last_sent_text != output_text && !output_text.empty ()) {
4113
- std::string remaining_text = output_text.substr (cb_data.last_sent_text .length ());
4114
- if (!remaining_text.empty ()) {
4115
- completion_token_output result;
4116
- result.tok = -1 ;
4117
- result.text_to_send = remaining_text;
4118
- result.prob = 1 .0f ;
4119
- send_partial_response (slot, result, false );
4120
- }
4121
- }
4118
+ // // Check if we need to send any remaining text that wasn't sent by callback
4119
+ // if (cb_data.last_sent_text != output_text && !output_text.empty()) {
4120
+ // std::string remaining_text = output_text.substr(cb_data.last_sent_text.length());
4121
+ // if (!remaining_text.empty()) {
4122
+ // completion_token_output result;
4123
+ // result.tok = -1;
4124
+ // result.text_to_send = remaining_text;
4125
+ // result.prob = 1.0f;
4126
+ // send_partial_response(slot, result, false);
4127
+ // }
4128
+ // }
4129
+ slot.generated_text = " " ; // clear to avoid resending
4130
+ send_final_response (slot);
4122
4131
} else if (!output_text.empty ()) {
4123
4132
// Non-streaming: send complete text at once
4124
4133
completion_token_output result;
4125
4134
result.tok = -1 ;
4126
4135
result.text_to_send = output_text;
4127
4136
result.prob = 1 .0f ;
4128
4137
send_partial_response (slot, result, false );
4138
+ send_final_response (slot);
4129
4139
}
4130
4140
4131
- send_final_response (slot);
4141
+
4132
4142
} else {
4133
4143
send_error (slot, " Diffusion generation failed" );
4134
4144
}
@@ -4807,45 +4817,51 @@ static bool diffusion_step_callback(int32_t step,
4807
4817
bool should_send = (step == 0 ) ||
4808
4818
(step == total_steps - 1 );
4809
4819
4810
- // Also send if text has changed significantly (more tokens decoded)
4811
- if (!should_send && current_text.length () > data->last_sent_text .length () + 10 ) {
4812
- should_send = true ;
4813
- }
4820
+ // // Also send if text has changed significantly (more tokens decoded)
4821
+ // if (!should_send && current_text.length() > data->last_sent_text.length() + 10) {
4822
+ // should_send = true;
4823
+ // }
4814
4824
4815
4825
// for chat/completions
4816
- if (should_send) {
4817
- std::string delta_text;
4818
- // Track token changes for debugging
4826
+ if (true ) {
4827
+ json delta_json;
4828
+ bool has_changes = false ;
4829
+
4830
+ // Track token changes and build JSON array
4819
4831
if (data->last_tokens && step > 0 ) {
4820
- int32_t changed_tokens = 0 ;
4821
- std::string changes_debug;
4832
+ json content_array = json::array () ;
4833
+
4822
4834
for (int32_t i = data->n_input ; i < n_tokens && i < data->diff_params ->max_length ; i++) {
4823
4835
if (data->last_tokens [i] != tokens[i]) {
4824
- changed_tokens++;
4825
- if (changes_debug.length () < 200 ) { // Limit debug output
4826
- char old_piece[64 ], new_piece[64 ];
4827
- int old_n_chars = llama_token_to_piece (data->vocab , data->last_tokens [i], old_piece, sizeof (old_piece), 0 , false );
4828
- int new_n_chars = llama_token_to_piece (data->vocab , tokens[i], new_piece, sizeof (new_piece), 0 , false );
4829
- old_piece[old_n_chars] = ' \0 ' ;
4836
+ char new_piece[256 ];
4837
+ int new_n_chars = llama_token_to_piece (data->vocab , tokens[i], new_piece, sizeof (new_piece), 0 , false );
4838
+ if (new_n_chars > 0 ) {
4830
4839
new_piece[new_n_chars] = ' \0 ' ;
4831
- changes_debug += string_format (" [%d: '%s'->'%s'] " , i - data->n_input , old_piece, new_piece);
4840
+ json change_obj = {
4841
+ {" position" , i - data->n_input },
4842
+ {" text" , std::string (new_piece)}
4843
+ };
4844
+ content_array.push_back (change_obj);
4832
4845
}
4833
4846
}
4834
4847
}
4835
- if (changed_tokens > 0 ) {
4836
- delta_text = string_format (" Token changes at step %d: %d tokens changed - %s\n " ,
4837
- step, changed_tokens, changes_debug.c_str ());
4838
- SRV_INF (" %s" , delta_text.c_str ());
4848
+
4849
+ if (!content_array.empty ()) {
4850
+ delta_json = content_array;
4851
+ has_changes = true ;
4852
+
4853
+ SRV_INF (" Token changes at step %d: %zu positions changed\n " , step, content_array.size ());
4854
+ SRV_INF (" Delta JSON: %s\n " , delta_json.dump ().c_str ());
4839
4855
}
4840
4856
}
4841
4857
4842
- if (!delta_text. empty () ) {
4843
- SRV_INF (" Sending diffusion delta: step=%d/%d, delta_len =%zu, delta=%s \n " ,
4844
- step, total_steps, delta_text. length (), delta_text. c_str ());
4858
+ if (has_changes ) {
4859
+ SRV_INF (" Sending diffusion delta: step=%d/%d, changes =%zu\n " ,
4860
+ step, total_steps, delta_json. size ());
4845
4861
4846
4862
completion_token_output progress_token;
4847
4863
progress_token.tok = -1 ; // Special value for progress
4848
- progress_token.text_to_send = delta_text;
4864
+ progress_token.text_to_send = delta_json. dump (); // Serialize JSON to string
4849
4865
progress_token.prob = 1 .0f ;
4850
4866
4851
4867
// Use is_progress=false to send actual content instead of progress info
0 commit comments