feat: support batch response for mix instance.

JimHsiung · liutongxuan · commit 5b23f522f19b · 2025-11-05T16:44:26.000+08:00
diff --git a/xllm/core/scheduler/async_response_processor.cpp b/xllm/core/scheduler/async_response_processor.cpp
@@ -41,7 +41,7 @@ AsyncResponseProcessor::AsyncResponseProcessor(
       role_(role.value_or(InstanceRole::DEFAULT)),
       enable_schedule_overlap_(enable_schedule_overlap),
       enable_decode_response_to_service_(enable_decode_response_to_service) {
-  if (role_ == InstanceRole::DECODE) {
+  if (role_ == InstanceRole::DECODE || role_ == InstanceRole::MIX) {
     enable_batch_response_ =
         util::get_bool_env("ENABLE_PD_DECODE_BATCH_RESPONSE", true);
   }
@@ -257,8 +257,9 @@ void AsyncResponseProcessor::process_stream_request(
 }
 
 void AsyncResponseProcessor::process_stream_requests(
-    const std::vector<std::shared_ptr<Request>>& requests) {
-  if (!enable_batch_response_) {
+    const std::vector<std::shared_ptr<Request>>& requests,
+    bool is_prefill) {
+  if (!enable_batch_response_ || is_prefill) {
     for (auto& req : requests) {
       process_stream_request(req);
     }
diff --git a/xllm/core/scheduler/async_response_processor.h b/xllm/core/scheduler/async_response_processor.h
@@ -46,7 +46,8 @@ class AsyncResponseProcessor final {
 
   // in disagg pd mode, decode send requests' responses to prefill
   void process_stream_requests(
-      const std::vector<std::shared_ptr<Request>>& requests);
+      const std::vector<std::shared_ptr<Request>>& requests,
+      bool is_prefill);
 
   // wait for all responses in queue to be handled
   void wait_completion();
diff --git a/xllm/core/scheduler/continuous_scheduler.cpp b/xllm/core/scheduler/continuous_scheduler.cpp
@@ -1037,7 +1037,8 @@ void ContinuousScheduler::process_batch_output(bool enable_schedule_overlap) {
     }
   }
   if (!stream_requests.empty()) {
-    response_processor_->process_stream_requests(stream_requests);
+    response_processor_->process_stream_requests(stream_requests,
+                                                 last_step_prefill_);
   }
 }
 
diff --git a/xllm/core/scheduler/disagg_pd_scheduler.cpp b/xllm/core/scheduler/disagg_pd_scheduler.cpp
@@ -250,7 +250,7 @@ void DisaggPDScheduler::step(const absl::Duration& timeout) {
   ContinuousScheduler::step(timeout);
   // send first generation token to decode instance
   // and remove the request from running_requests_ to remote_requests_map_
-  if (options_.instance_role() != InstanceRole::DECODE) {
+  if (options_.instance_role() != InstanceRole::DECODE && last_step_prefill_) {
     prefill_send_first_generation();
   }
 }

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ AsyncResponseProcessor::AsyncResponseProcessor(`
`41`	`41`	`role_(role.value_or(InstanceRole::DEFAULT)),`
`42`	`42`	`enable_schedule_overlap_(enable_schedule_overlap),`
`43`	`43`	`enable_decode_response_to_service_(enable_decode_response_to_service) {`
`44`		`- if (role_ == InstanceRole::DECODE) {`
	`44`	`+ if (role_ == InstanceRole::DECODE || role_ == InstanceRole::MIX) {`
`45`	`45`	`enable_batch_response_ =`
`46`	`46`	`util::get_bool_env("ENABLE_PD_DECODE_BATCH_RESPONSE", true);`
`47`	`47`	`}`
`@@ -257,8 +257,9 @@ void AsyncResponseProcessor::process_stream_request(`
`257`	`257`	`}`
`258`	`258`
`259`	`259`	`void AsyncResponseProcessor::process_stream_requests(`
`260`		`- const std::vector<std::shared_ptr<Request>>& requests) {`
`261`		`- if (!enable_batch_response_) {`
	`260`	`+ const std::vector<std::shared_ptr<Request>>& requests,`
	`261`	`+ bool is_prefill) {`
	`262`	`+ if (!enable_batch_response_ || is_prefill) {`
`262`	`263`	`for (auto& req : requests) {`
`263`	`264`	`process_stream_request(req);`
`264`	`265`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1037,7 +1037,8 @@ void ContinuousScheduler::process_batch_output(bool enable_schedule_overlap) {`
`1037`	`1037`	`}`
`1038`	`1038`	`}`
`1039`	`1039`	`if (!stream_requests.empty()) {`
`1040`		`- response_processor_->process_stream_requests(stream_requests);`
	`1040`	`+ response_processor_->process_stream_requests(stream_requests,`
	`1041`	`+ last_step_prefill_);`
`1041`	`1042`	`}`
`1042`	`1043`	`}`
`1043`	`1044`
Original file line number	Diff line number	Diff line change
`@@ -250,7 +250,7 @@ void DisaggPDScheduler::step(const absl::Duration& timeout) {`
`250`	`250`	`ContinuousScheduler::step(timeout);`
`251`	`251`	`// send first generation token to decode instance`
`252`	`252`	`// and remove the request from running_requests_ to remote_requests_map_`
`253`		`- if (options_.instance_role() != InstanceRole::DECODE) {`
	`253`	`+ if (options_.instance_role() != InstanceRole::DECODE && last_step_prefill_) {`
`254`	`254`	`prefill_send_first_generation();`
`255`	`255`	`}`
`256`	`256`	`}`