 )
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
+from vllm.inputs.data import TokensPrompt
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from worker import VllmWorkers

@@ -78,9 +79,11 @@ async def chat_completions(request: ChatCompletionRequest):
             or self.http_client is None
         ):
             return ErrorResponse(
-                message="Service not ready",
-                type="service_unavailable",
-                code=503,
+                error={
+                    "message": "Service not ready",
+                    "type": "service_unavailable",
+                    "code": 503,
+                },
             )

         try:
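The `ErrorResponse` hunks in this commit all make the same change: flat `message`/`type`/`code` keyword arguments are replaced by a nested `error` object. A minimal sketch of the shape the new calls assume (the `ErrorInfo` model and field set here are illustrative, not vLLM's exact definition):

```python
# Minimal sketch of the nested-error shape the updated calls assume.
# ErrorInfo and its fields are illustrative, not vLLM's exact model.
from pydantic import BaseModel

class ErrorInfo(BaseModel):
    message: str
    type: str
    code: int

class ErrorResponse(BaseModel):
    error: ErrorInfo

resp = ErrorResponse(
    error={"message": "Service not ready", "type": "service_unavailable", "code": 503}
)
print(resp.model_dump_json())
# {"error":{"message":"Service not ready","type":"service_unavailable","code":503}}
```

This nesting also matches the `{"error": {...}}` envelope that OpenAI-compatible clients expect in error responses.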
@@ -95,9 +98,11 @@ async def chat_completions(request: ChatCompletionRequest):
                 max_tokens_value = request.max_tokens
             else:
                 return ErrorResponse(
-                    message="Either max_tokens or max_completion_tokens must be specified",
-                    type="invalid_request_error",
-                    code=400,
+                    error={
+                        "message": "Either max_tokens or max_completion_tokens must be specified",
+                        "type": "invalid_request_error",
+                        "code": 400,
+                    },
                 )

             # Use vLLM's preprocessing to convert chat to prompt
@@ -119,19 +124,21 @@ async def chat_completions(request: ChatCompletionRequest):

             # Convert request to sampling parameters with our determined max_tokens
             sampling_params = request.to_sampling_params(
-                default_max_tokens=max_tokens_value,
+                max_tokens=max_tokens_value,
                 logits_processor_pattern=None,
-                default_sampling_params=None,
+                default_sampling_params={},
             )

             # Get best worker using HTTP request to router
             tokens: list[int] = engine_prompt["prompt_token_ids"]
             num_tokens = len(tokens)
             if num_tokens == 0:
                 return ErrorResponse(
-                    message="Input prompt is empty",
-                    type="invalid_request_error",
-                    code=400,
+                    error={
+                        "message": "Input prompt is empty",
+                        "type": "invalid_request_error",
+                        "code": 400,
+                    }
                 )

             # It is much preferred to communicate block hashes to the router instead of
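The `to_sampling_params` hunk above tracks a signature change: `default_max_tokens` is renamed to `max_tokens`, and `default_sampling_params` now takes a dict rather than `None`. If the surrounding code ever needed to run against both old and new vLLM versions, one option would be to dispatch on the method's signature (a hedged compatibility sketch, not part of this commit):

```python
# Hypothetical compatibility shim: pick whichever keyword the installed
# vLLM version accepts. Not part of this commit.
import inspect

def build_sampling_params(request, max_tokens_value):
    params = inspect.signature(request.to_sampling_params).parameters
    if "max_tokens" in params:  # newer signature
        return request.to_sampling_params(
            max_tokens=max_tokens_value,
            logits_processor_pattern=None,
            default_sampling_params={},
        )
    return request.to_sampling_params(  # older signature
        default_max_tokens=max_tokens_value,
        logits_processor_pattern=None,
        default_sampling_params=None,
    )
```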
@@ -161,9 +168,11 @@ async def chat_completions(request: ChatCompletionRequest):
             except (httpx.RequestError, httpx.HTTPStatusError) as e:
                 logger.error(f"Router request failed: {e}")
                 return ErrorResponse(
-                    message="Router service unavailable",
-                    type="service_unavailable",
-                    code=503,
+                    error={
+                        "message": "Router service unavailable",
+                        "type": "service_unavailable",
+                        "code": 503,
+                    }
                 )

             logger.info(f"Selected worker {best_worker_id} for request")
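For context, the `except` clause above guards an HTTP round-trip to the router. A hedged sketch of what that call might look like; the `/find_best_worker` path, request body, and response shape are assumptions not shown in this diff:

```python
# Hypothetical router query; endpoint name and payload are assumptions.
# Assumes self.http_client is an httpx.AsyncClient created with the
# router's base_url.
async def query_router(self, local_hashes: list[int]) -> int:
    resp = await self.http_client.post(
        "/find_best_worker", json={"local_hashes": local_hashes}
    )
    resp.raise_for_status()  # raises httpx.HTTPStatusError on 4xx/5xx
    return resp.json()["worker_id"]
```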
@@ -172,9 +181,13 @@ async def chat_completions(request: ChatCompletionRequest):
             request_id = f"chatcmpl-{uuid.uuid4()}"
             request_metadata = RequestResponseMetadata(request_id=request_id)

+            # Convert engine_prompt dict to TokensPrompt object
+            tokens_prompt = TokensPrompt(prompt_token_ids=tokens)
+            logger.info(f"Created TokensPrompt with {len(tokens)} tokens")
+
             # Get the generator from the selected worker with sampling params
             result_generator = self.workers.direct(
-                engine_prompt, best_worker_id, sampling_params
+                tokens_prompt, best_worker_id, sampling_params
             )
             assert request.stream

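`TokensPrompt` from `vllm.inputs.data` is a `TypedDict`, so the "conversion" added above just builds a plain dict with the key the engine expects. A self-contained illustration of the idea (the real class carries additional optional fields):

```python
# Standalone illustration; vLLM's real TokensPrompt lives in
# vllm.inputs.data and has more optional keys.
from typing import TypedDict

class TokensPrompt(TypedDict, total=False):
    prompt_token_ids: list[int]

tokens_prompt = TokensPrompt(prompt_token_ids=[1, 2, 3])
assert tokens_prompt == {"prompt_token_ids": [1, 2, 3]}
```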
@@ -188,14 +201,17 @@ async def chat_completions(request: ChatCompletionRequest):
                     conversation,
                     self.tokenizer,
                     request_metadata,
+                    enable_force_include_usage=False,
                 ),
                 media_type="text/event-stream",
                 headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
             )

         except Exception as e:
             logger.error(f"Error processing request: {e}")
-            return ErrorResponse(message=str(e), type="internal_error", code=500)
+            return ErrorResponse(
+                error={"message": str(e), "type": "internal_error", "code": 500}
+            )

     async def initialize_services(self):
         """Initialize workers, HTTP client, and OpenAI serving components"""
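Since the handler returns a `text/event-stream` `StreamingResponse` (and asserts `request.stream`), clients consume it as server-sent events. A hedged client-side sketch; the host, port, and payload are placeholders:

```python
# Hypothetical client for the streaming endpoint; the URL is a placeholder.
import asyncio
import json

import httpx

async def stream_chat(payload: dict) -> None:
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream(
            "POST", "http://localhost:8000/v1/chat/completions", json=payload
        ) as resp:
            async for line in resp.aiter_lines():
                if line.startswith("data: ") and line != "data: [DONE]":
                    chunk = json.loads(line[len("data: "):])
                    # Chunks follow the OpenAI chat-completion streaming schema.
                    print(chunk["choices"][0]["delta"].get("content", ""), end="")

# Example usage (model name and messages are placeholders):
# asyncio.run(stream_chat({"model": "...", "messages": [...], "stream": True}))
```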