@@ -138,12 +138,12 @@ def __init__(self,
         # schedule and execute batches, and is required by pipeline parallelism
         # to eliminate pipeline bubbles.
         self.batch_queue_size = self.model_executor.max_concurrent_batches
-        self.batch_queue: Optional[queue.Queue[tuple[Future[ModelRunnerOutput],
-                                                     SchedulerOutput]]] = None
+        self.batch_queue: Optional[deque[tuple[Future[ModelRunnerOutput],
+                                               SchedulerOutput]]] = None
         if self.batch_queue_size > 1:
             logger.info("Batch queue is enabled with size %d",
                         self.batch_queue_size)
-            self.batch_queue = queue.Queue(self.batch_queue_size)
+            self.batch_queue = deque(maxlen=self.batch_queue_size)

         self.request_block_hasher: Optional[Callable[[Request],
                                                      list[BlockHash]]] = None
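Swapping `queue.Queue` for `collections.deque` drops locking overhead the single-threaded engine loop never needed, and it enables the non-blocking checks the new `step_with_batch_queue` relies on: `len()` instead of `full()`, truthiness instead of `empty()`, and `batch_queue[-1]` to peek at the oldest in-flight batch, which `queue.Queue` cannot do safely. A minimal standalone sketch of the access pattern (identifiers here are illustrative, not taken from vLLM):

```python
from collections import deque

# Bounded FIFO used the way the new batch_queue is: newest entries go in
# on the left, the oldest comes out on the right.
batch_queue: deque[str] = deque(maxlen=3)

batch_queue.appendleft("batch-0")
batch_queue.appendleft("batch-1")

assert len(batch_queue) < 3      # non-blocking "not full" check
assert batch_queue               # truthiness replaces Queue.empty()
print(batch_queue[-1])           # peek at oldest without removing: "batch-0"
oldest = batch_queue.pop()       # dequeue oldest; never blocks
assert oldest == "batch-0"
```

One caveat of this pattern: `deque(maxlen=...)` silently evicts from the opposite end once full. The engine avoids ever relying on that by asserting `len(batch_queue) < self.batch_queue_size` before each `appendleft`.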
@@ -319,41 +319,43 @@ def step_with_batch_queue(
            batch in the job queue is finished.
         3. Update the scheduler from the output.
         """
-        assert self.batch_queue is not None
+        batch_queue = self.batch_queue
+        assert batch_queue is not None

-        engine_core_outputs = None
-        scheduler_output = None
         # Try to schedule a new batch if the batch queue is not full, but
         # the scheduler may return an empty batch if all requests are scheduled.
         # Note that this is not blocking.
-        if not self.batch_queue.full():
-            scheduler_output = self.scheduler.schedule()
-            if scheduler_output.total_num_scheduled_tokens > 0:
-                future = self.model_executor.execute_model(scheduler_output)
-                self.batch_queue.put_nowait(
-                    (future, scheduler_output))  # type: ignore
-
-        scheduled_batch = (scheduler_output is not None
-                           and scheduler_output.total_num_scheduled_tokens > 0)
-
-        # If no more requests can be scheduled and the job queue is not empty,
-        # block until the first batch in the job queue is finished.
-        # TODO(comaniac): Ideally we should peek the first batch in the
-        # job queue to check if it's finished before scheduling a new batch,
-        # but peeking the first element in a queue is not thread-safe,
-        # so we need more work.
-        if not scheduled_batch and not self.batch_queue.empty():
-            future, scheduler_output = self.batch_queue.get_nowait()
+        assert len(batch_queue) < self.batch_queue_size

-            # Blocking until the first result is available.
-            model_output = self.execute_model_with_error_logging(
-                lambda _: future.result(), scheduler_output)
+        model_executed = False
+        if self.scheduler.has_requests():
+            scheduler_output = self.scheduler.schedule()
+            future = self.model_executor.execute_model(scheduler_output)
+            batch_queue.appendleft(
+                (future, scheduler_output))  # type: ignore[arg-type]
+
+            model_executed = scheduler_output.total_num_scheduled_tokens > 0
+            if model_executed and len(batch_queue) < self.batch_queue_size \
+                    and not batch_queue[-1][0].done():
+                # Don't block on next worker response unless the queue is full
+                # or there are no more requests to schedule.
+                return None, True
+
+        elif not batch_queue:
+            # Queue is empty. We should not reach here since this method should
+            # only be called when the scheduler contains requests or the queue
+            # is non-empty.
+            return None, False
+
+        # Block until the next result is available.
+        future, scheduler_output = batch_queue.pop()
+        model_output = self.execute_model_with_error_logging(
+            lambda _: future.result(), scheduler_output)

-            self.batch_queue.task_done()
-            engine_core_outputs = (self.scheduler.update_from_output(
-                scheduler_output, model_output))
+        engine_core_outputs = self.scheduler.update_from_output(
+            scheduler_output, model_output)

-        return engine_core_outputs, scheduled_batch
+        return engine_core_outputs, model_executed

     def shutdown(self):
         self.structured_output_manager.clear_backend()
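The rewritten `step_with_batch_queue` keeps the pipeline full: after submitting a batch it returns immediately unless the queue is full, the scheduler has nothing left, or the oldest result is already available, and only then blocks on `batch_queue.pop()`. A toy model of that control flow, using a `ThreadPoolExecutor` and plain integers as stand-ins for the model executor and `SchedulerOutput` (all names below are hypothetical):

```python
from collections import deque
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Optional
import time

QUEUE_SIZE = 2
pool = ThreadPoolExecutor(max_workers=QUEUE_SIZE)
batch_queue: deque[tuple[Future, int]] = deque(maxlen=QUEUE_SIZE)
pending = list(range(4))  # four "batches" waiting to be scheduled

def run_batch(batch: int) -> str:
    time.sleep(0.01)  # stand-in for model execution
    return f"output-{batch}"

def step() -> tuple[Optional[str], bool]:
    assert len(batch_queue) < QUEUE_SIZE
    executed = False
    if pending:
        batch = pending.pop(0)
        batch_queue.appendleft((pool.submit(run_batch, batch), batch))
        executed = True
        # Pipeline-friendly fast path: don't block while there is room in
        # the queue and the oldest batch hasn't finished yet.
        if len(batch_queue) < QUEUE_SIZE and not batch_queue[-1][0].done():
            return None, True
    elif not batch_queue:
        return None, False  # nothing scheduled, nothing in flight
    future, _ = batch_queue.pop()  # oldest batch; result() may block
    return future.result(), executed

while True:
    output, progressed = step()
    if output is None and not progressed:
        break
    if output is not None:
        print(output)  # results arrive in scheduling order
```

The boolean plays the role of `model_executed` in the real method: the caller uses it to decide whether the engine made forward progress on this step.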
@@ -388,7 +390,7 @@ def is_sleeping(self) -> bool:
         return self.model_executor.is_sleeping

     def execute_dummy_batch(self):
-        self.model_executor.collective_rpc("execute_dummy_batch")
+        self.model_executor.execute_dummy_batch()

     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
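Replacing the stringly-typed `collective_rpc("execute_dummy_batch")` call with a direct method call moves the dispatch behind the executor interface. A plausible shape for that wrapper, sketched here rather than quoted from the vLLM source:

```python
class Executor:
    def collective_rpc(self, method: str, *args, **kwargs) -> list:
        """Invoke `method` on every worker (stub for illustration)."""
        raise NotImplementedError

    def execute_dummy_batch(self) -> None:
        # Typed entry point: callers no longer need to know the
        # worker-side method name, and executor subclasses (e.g. for
        # pipeline parallelism) can override the behavior in one place.
        self.collective_rpc("execute_dummy_batch")
```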
@@ -733,7 +735,8 @@ def _process_input_queue(self):
         """Exits when an engine step needs to be performed."""

         waited = False
-        while not self.engines_running and not self.scheduler.has_requests():
+        while not self.engines_running and not self.scheduler.has_requests() \
+                and not self.batch_queue:
             if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
                 logger.debug("EngineCore waiting for work.")
                 waited = True
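The extra `and not self.batch_queue` term matters once batches are pipelined: even when the scheduler has nothing new to schedule, in-flight futures in the queue still need to be collected by `step_with_batch_queue`, so the input-processing loop must not park while the queue is non-empty. A small sketch of the guard's semantics (hypothetical helper, same condition as the inline `while`):

```python
from collections import deque
from typing import Optional

def should_keep_waiting(engines_running: bool, has_requests: bool,
                        batch_queue: Optional[deque]) -> bool:
    # Mirror of the loop guard: park only when there is truly no work.
    # An in-flight batch counts as work even with nothing to schedule.
    return not engines_running and not has_requests and not batch_queue

assert should_keep_waiting(False, False, None)            # idle: wait
assert should_keep_waiting(False, False, deque())         # empty queue: wait
assert not should_keep_waiting(False, False, deque([1]))  # in-flight: step
assert not should_keep_waiting(False, True, deque())      # requests: step
```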