import asyncio
import time
from functools import partial
-from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
-                    Set, Tuple, Type, Union)
+from typing import (AsyncIterator, Callable, Dict, Iterable, List, Mapping,
+                    Optional, Set, Tuple, Type, Union)

from transformers import PreTrainedTokenizer
@@ -151,7 +151,10 @@ def process_exception(self,
        logger.info("Finished request %s.", request_id)
        self.abort_request(request_id)

-    def add_request(self, request_id: str,
+    def add_request(self,
+                    request_id: str,
+                    *,
+                    verbose: bool = False,
                    **engine_add_request_kwargs) -> AsyncStream:
        """Add a request to be sent to the engine on the next background
        loop iteration."""
@@ -166,6 +169,9 @@ def add_request(self, request_id: str,

        self.new_requests_event.set()

+        if verbose:
+            logger.info("Added request %s.", request_id)
+
        return stream

    def abort_request(self, request_id: str, *, verbose: bool = False) -> None:
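The new `verbose` flag sits after a bare `*`, so call sites must pass it by name and it can never be swallowed by `**engine_add_request_kwargs`. A minimal standalone sketch of the pattern (`MiniTracker` is a toy stand-in, not vLLM's `RequestTracker`):

```python
import logging

logger = logging.getLogger(__name__)

class MiniTracker:
    """Toy stand-in for RequestTracker, showing the keyword-only flag."""

    def add_request(self, request_id: str, *, verbose: bool = False,
                    **engine_add_request_kwargs):
        # ... enqueue the request here ...
        if verbose:
            logger.info("Added request %s.", request_id)
        return engine_add_request_kwargs

tracker = MiniTracker()
tracker.add_request("req-0", verbose=True)  # OK: flag passed by name
# tracker.add_request("req-0", True)        # TypeError: verbose is keyword-only
```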
@@ -299,14 +305,14 @@ async def process_model_inputs_async(
        return self.input_processor(llm_inputs)

    async def add_request_async(
-        self,
-        request_id: str,
-        inputs: PromptInputs,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Dict[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None
+        self,
+        request_id: str,
+        inputs: PromptInputs,
+        params: Union[SamplingParams, PoolingParams],
+        arrival_time: Optional[float] = None,
+        lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> None:
        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
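Typing `trace_headers` as `Mapping` instead of `Dict` only loosens the contract: a plain `dict` still satisfies it, but so does any read-only mapping, such as the immutable header views HTTP frameworks hand out. A small illustration of the difference as a type checker sees it (the function names here are hypothetical):

```python
from types import MappingProxyType
from typing import Dict, Mapping, Optional

def takes_dict(trace_headers: Optional[Dict[str, str]] = None) -> None:
    pass

def takes_mapping(trace_headers: Optional[Mapping[str, str]] = None) -> None:
    pass

headers = MappingProxyType({"traceparent": "00-abc-def-01"})

takes_mapping(headers)  # accepted: any Mapping[str, str] conforms
takes_dict(headers)     # runs, but a type checker rejects it: not a Dict
```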
@@ -353,8 +359,6 @@ class AsyncLLMEngine:
            async frontend will be executed in a separate process as the
            model workers.
        log_requests: Whether to log the requests.
-        max_log_len: Maximum number of prompt characters or prompt ID numbers
-            being printed in log.
        start_engine_loop: If True, the background task to run the engine
            will be automatically started in the generate call.
        *args: Arguments for :class:`LLMEngine`.
@@ -368,13 +372,11 @@ def __init__(self,
                 engine_use_ray: bool,
                 *args,
                 log_requests: bool = True,
-                 max_log_len: Optional[int] = None,
                 start_engine_loop: bool = True,
                 **kwargs) -> None:
        self.worker_use_ray = worker_use_ray
        self.engine_use_ray = engine_use_ray
        self.log_requests = log_requests
-        self.max_log_len = max_log_len
        self.engine = self._init_engine(*args, **kwargs)

        self.background_loop: Optional[asyncio.Future] = None
@@ -468,7 +470,6 @@ def from_engine_args(
            executor_class=executor_class,
            log_requests=not engine_args.disable_log_requests,
            log_stats=not engine_args.disable_log_stats,
-            max_log_len=engine_args.max_log_len,
            start_engine_loop=start_engine_loop,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
@@ -667,30 +668,9 @@ async def add_request(
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Dict[str, str]] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None
    ) -> AsyncStream:
-        if self.log_requests:
-            if isinstance(inputs, str):
-                shortened_prompt = inputs
-                shortened_token_ids = None
-            else:
-                shortened_prompt = inputs.get("prompt")
-                shortened_token_ids = inputs.get("prompt_token_ids")
-
-            max_log_len = self.max_log_len
-            if max_log_len is not None:
-                if shortened_prompt is not None:
-                    shortened_prompt = shortened_prompt[:max_log_len]
-                if shortened_token_ids is not None:
-                    shortened_token_ids = shortened_token_ids[:max_log_len]
-
-            logger.info(
-                "Received request %s: prompt: %r, "
-                "params: %s, prompt_token_ids: %s, "
-                "lora_request: %s.", request_id, shortened_prompt, params,
-                shortened_token_ids, lora_request)
-
        if not self.is_running:
            if self.start_engine_loop:
                self.start_background_loop()
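With `max_log_len` gone, the frontend no longer truncates prompts before logging, and per-request logging shrinks to the tracker's "Added request" line. Callers that relied on truncated prompt logs can clip at the call site; the `shorten_for_log` helper below is hypothetical, not part of vLLM:

```python
import logging

logger = logging.getLogger(__name__)

def shorten_for_log(prompt: str, max_len: int = 256) -> str:
    # Clip long prompts for log output, roughly what the removed
    # max_log_len branch used to do inside add_request.
    return prompt if len(prompt) <= max_len else prompt[:max_len] + "..."

logger.info("Submitting request %s: prompt: %r", "req-0",
            shorten_for_log("a very long prompt " * 100))
```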
@@ -706,6 +686,7 @@ async def add_request(

        stream = self._request_tracker.add_request(
            request_id,
+            verbose=self.log_requests,
            inputs=inputs,
            params=params,
            arrival_time=arrival_time,
@@ -721,7 +702,7 @@ async def generate(
        sampling_params: SamplingParams,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Dict[str, str]] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None
    ) -> AsyncIterator[RequestOutput]:
        """Generate outputs for a request.
@@ -804,7 +785,7 @@ async def encode(
        pooling_params: PoolingParams,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Dict[str, str]] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
    ) -> AsyncIterator[EmbeddingRequestOutput]:
        """Generate outputs for a request from an embedding model.
@@ -882,7 +863,7 @@ async def _process_request(
        params: Union[SamplingParams, PoolingParams],
        *,
        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Dict[str, str]] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
    ) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]:
        """Common logic to process requests with SamplingParams or
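Taken together, existing callers need no changes: a plain `dict` is a `Mapping`, and per-request logging now follows `log_requests` automatically. A hedged end-to-end sketch, assuming a vLLM build with these changes (the model name and trace header value are placeholders):

```python
import asyncio

from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

async def main() -> None:
    # disable_log_requests=False (the default) now drives the tracker's
    # "Added request" log line; there is no max_log_len knob anymore.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))  # placeholder model
    results = engine.generate(
        "Hello, my name is",
        SamplingParams(max_tokens=16),
        request_id="req-0",
        trace_headers={"traceparent": "00-abc-def-01"},  # any Mapping works
    )
    async for request_output in results:
        print(request_output.outputs[0].text)

asyncio.run(main())
```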