@@ -14,7 +14,7 @@
 from vllm import (AsyncLLMEngine, CompletionOutput, RequestOutput,
                   SamplingParams)
 from vllm.config import ModelConfig
-from vllm.entrypoints.grpc.pb import generation_pb2_grpc
+from vllm.entrypoints.grpc.pb import generation_pb2_grpc  # type: ignore
 # yapf: disable
 from vllm.entrypoints.grpc.pb.generation_pb2 import (BatchedGenerationRequest,
                                                      BatchedGenerationResponse,
@@ -54,15 +54,15 @@ async def _handle_exception(e: Exception, func, *args, **kwargs):
     if not isinstance(e, AbortError):
         if type(e).__name__ == "torch.cuda.OutOfMemoryError":  #TODO check
             context = kwargs.get("context", None) or args[-1]
-            logger.exception(f"{func.__name__} caused GPU OOM error")
+            logger.exception("%s caused GPU OOM error", func.__name__)
             service_metrics.count_request_failure(FailureReasonLabel.OOM)
             await context.abort(StatusCode.RESOURCE_EXHAUSTED, str(e))
         else:
             if "generate" in func.__name__.lower():
                 service_metrics.count_request_failure(FailureReasonLabel.GENERATE)
             else:
                 service_metrics.count_request_failure(FailureReasonLabel.UNKNOWN)
-            logger.exception(f"{func.__name__} failed")
+            logger.exception("%s failed", func.__name__)
     raise e


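The recurring change in this hunk, and in several hunks below, replaces eager f-string interpolation inside logger calls with lazy %-style arguments, so the message is only formatted when a handler actually emits the record (this is also what pylint's W1203, logging-fstring-interpolation, asks for). A minimal standalone sketch of the pattern, not taken from the patched file; the handle_request function and its message are invented for illustration:

    import logging

    logger = logging.getLogger(__name__)

    def handle_request(request_id: str) -> None:
        # Eager f-string: the message is built even when INFO logging is disabled.
        # logger.info(f"handling request {request_id}")

        # Lazy %-style arguments (the form this patch switches to): formatting is
        # deferred until a handler actually emits the record.
        logger.info("handling request %s", request_id)

    if __name__ == "__main__":
        logging.basicConfig(level=logging.INFO)
        handle_request("req-123")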
@@ -295,7 +295,7 @@ def _convert_output(self,
             text=output.text[text_start_offset:],
             generated_token_count=len(output.token_ids),
             stop_reason=stop_reason,
-            stop_sequence=stop_sequence,
+            stop_sequence=stop_sequence if stop_sequence else '',
         )

         if resp_options.generated_tokens:
@@ -413,7 +413,8 @@ async def _validate_and_convert_params(

     @staticmethod
     def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
-                        time_limit_reached: bool) -> Tuple['StopReason', str]:
+                        time_limit_reached: bool
+                        ) -> Tuple[StopReason.ValueType, Optional[str]]:
         finish_reason = output.finish_reason
         stop_sequence = None
         if finish_reason is None:
@@ -433,20 +434,20 @@ def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
                 stop_sequence = stop_str_or_tok
             else:
                 logger.warning(
-                    f"Unexpected stop_reason type: {type(stop_str_or_tok)}"
+                    "Unexpected stop_reason type: %s", type(stop_str_or_tok)
                 )
         elif finish_reason == "abort":
             stop_reason = StopReason.CANCELLED
         else:
-            logger.warning(f"Unrecognized finish_reason: {finish_reason}")
+            logger.warning("Unrecognized finish_reason: %s", finish_reason)
             stop_reason = StopReason.CANCELLED

         return stop_reason, stop_sequence

     def _convert_tokens(
         self,
-        token_ids: list[int],
-        logprobs_list: Optional[list[Dict[int, Logprob]]],
+        token_ids: List[int],
+        logprobs_list: Optional[List[Dict[int, Logprob]]],
         include_logprobs: bool,
         include_ranks: bool,
         top_n_tokens: int,
@@ -499,7 +500,7 @@ async def _validate_prompt_and_tokenize(
         #                    "max_length": truncate_input_tokens} \
         #     if truncate_input_tokens is not None else {
         #         "truncation": True, "max_length": max_model_len + 1}
-        tokenize_kwargs = {}
+        tokenize_kwargs: Dict[str, Any] = {}

         input_ids = await self.tokenizer_group.encode_async(
             prompt, **tokenize_kwargs)
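The typing tweaks in the two hunks above (typing.List instead of the built-in list in annotations, and an explicit Dict[str, Any] on the empty tokenize_kwargs dict) presumably keep the annotations usable on Python 3.8, where built-in generics are not subscriptable at runtime, and give mypy a concrete type for the empty literal. A small sketch, separate from the patched file and using invented names:

    from typing import Any, Dict, List

    # On Python 3.8, `list[int]` in an annotation evaluated at definition time
    # raises "TypeError: 'type' object is not subscriptable"; typing.List also
    # works there.
    def count_tokens(token_ids: List[int]) -> int:
        return len(token_ids)

    # mypy reports "Need type annotation for 'tokenize_kwargs'" on a bare `{}`;
    # Dict[str, Any] resolves that while still allowing mixed value types.
    tokenize_kwargs: Dict[str, Any] = {}
    tokenize_kwargs["truncation"] = True
    tokenize_kwargs["max_length"] = 512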
@@ -661,6 +662,6 @@ async def start_grpc_server(engine: AsyncLLMEngine,
     server.add_insecure_port(listen_on)

     await server.start()
-    logger.info(f"gRPC Server started at {listen_on}")
+    logger.info("gRPC Server started at %s", listen_on)

     return server