4 files changed: +9 -6 lines changed
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ AsyncResponseProcessor::AsyncResponseProcessor(
4141 role_(role.value_or(InstanceRole::DEFAULT)),
4242 enable_schedule_overlap_(enable_schedule_overlap),
4343 enable_decode_response_to_service_(enable_decode_response_to_service) {
44- if (role_ == InstanceRole::DECODE) {
44+ if (role_ == InstanceRole::DECODE || role_ == InstanceRole::MIX ) {
4545 enable_batch_response_ =
4646 util::get_bool_env (" ENABLE_PD_DECODE_BATCH_RESPONSE" , true );
4747 }
@@ -257,8 +257,9 @@ void AsyncResponseProcessor::process_stream_request(
257257}
258258
259259void AsyncResponseProcessor::process_stream_requests (
260- const std::vector<std::shared_ptr<Request>>& requests) {
261- if (!enable_batch_response_) {
260+ const std::vector<std::shared_ptr<Request>>& requests,
261+ bool is_prefill) {
262+ if (!enable_batch_response_ || is_prefill) {
262263 for (auto & req : requests) {
263264 process_stream_request (req);
264265 }
Original file line number Diff line number Diff line change @@ -46,7 +46,8 @@ class AsyncResponseProcessor final {
4646
4747 // in disagg pd mode, decode sends requests' responses to prefill
4848 void process_stream_requests (
49- const std::vector<std::shared_ptr<Request>>& requests);
49+ const std::vector<std::shared_ptr<Request>>& requests,
50+ bool is_prefill);
5051
5152 // wait for all responses in queue to be handled
5253 void wait_completion ();
Original file line number Diff line number Diff line change @@ -1037,7 +1037,8 @@ void ContinuousScheduler::process_batch_output(bool enable_schedule_overlap) {
10371037 }
10381038 }
10391039 if (!stream_requests.empty ()) {
1040- response_processor_->process_stream_requests (stream_requests);
1040+ response_processor_->process_stream_requests (stream_requests,
1041+ last_step_prefill_);
10411042 }
10421043}
10431044
Original file line number Diff line number Diff line change @@ -250,7 +250,7 @@ void DisaggPDScheduler::step(const absl::Duration& timeout) {
250250 ContinuousScheduler::step (timeout);
251251 // send first generation token to decode instance
252252 // and move the request from running_requests_ to remote_requests_map_
253- if (options_.instance_role () != InstanceRole::DECODE) {
253+ if (options_.instance_role () != InstanceRole::DECODE && last_step_prefill_ ) {
254254 prefill_send_first_generation ();
255255 }
256256}
You can’t perform that action at this time.
0 commit comments