Commit 81b8af8

fix: update router standalone to use updated vLLM API (#4079)

Signed-off-by: Sean Choi <[email protected]>
1 parent 98842a0

3 files changed, +45 −20 lines

This commit updates the router standalone example to match newer vLLM interfaces: ErrorResponse now takes a nested error payload instead of top-level message/type/code arguments, ChatCompletionRequest.to_sampling_params uses the max_tokens keyword and an empty default_sampling_params dict, prompts are handed to workers as TokensPrompt objects, the gpu_cache_usage metric is renamed to kv_cache_usage, and the worker's VllmConfig gains an explicit ObservabilityConfig.

examples/deployments/router_standalone/api.py
Lines changed: 32 additions & 16 deletions

```diff
@@ -35,6 +35,7 @@
 )
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
+from vllm.inputs.data import TokensPrompt
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from worker import VllmWorkers
 
@@ -78,9 +79,11 @@ async def chat_completions(request: ChatCompletionRequest):
                 or self.http_client is None
             ):
                 return ErrorResponse(
-                    message="Service not ready",
-                    type="service_unavailable",
-                    code=503,
+                    error={
+                        "message": "Service not ready",
+                        "type": "service_unavailable",
+                        "code": 503,
+                    },
                 )
 
             try:
```
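
The ErrorResponse change above is the main API migration in this file: instead of passing message, type, and code as top-level keyword arguments, the handler now wraps them in a nested error payload. A minimal sketch of the new construction, assuming ErrorResponse lives in vllm.entrypoints.openai.protocol (the import statement is outside the hunks shown) and that the installed vLLM matches the version this commit targets:

```python
# Sketch only: the nested field names come from this diff; the import path
# is an assumption, since the import is not part of the hunks shown here.
from vllm.entrypoints.openai.protocol import ErrorResponse

resp = ErrorResponse(
    error={
        "message": "Service not ready",
        "type": "service_unavailable",
        "code": 503,
    }
)

# ErrorResponse is a pydantic model, so the nested payload serializes directly.
print(resp.model_dump_json())
```

The same nested shape is used for every error path touched in this file: missing max_tokens, empty prompt, router unavailable, and the final catch-all handler.
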
```diff
@@ -95,9 +98,11 @@ async def chat_completions(request: ChatCompletionRequest):
                     max_tokens_value = request.max_tokens
                 else:
                     return ErrorResponse(
-                        message="Either max_tokens or max_completion_tokens must be specified",
-                        type="invalid_request_error",
-                        code=400,
+                        error={
+                            "message": "Either max_tokens or max_completion_tokens must be specified",
+                            "type": "invalid_request_error",
+                            "code": 400,
+                        },
                     )
 
                 # Use vLLM's preprocessing to convert chat to prompt
@@ -119,19 +124,21 @@ async def chat_completions(request: ChatCompletionRequest):
 
                 # Convert request to sampling parameters with our determined max_tokens
                 sampling_params = request.to_sampling_params(
-                    default_max_tokens=max_tokens_value,
+                    max_tokens=max_tokens_value,
                     logits_processor_pattern=None,
-                    default_sampling_params=None,
+                    default_sampling_params={},
                 )
 
                 # Get best worker using HTTP request to router
                 tokens: list[int] = engine_prompt["prompt_token_ids"]
                 num_tokens = len(tokens)
                 if num_tokens == 0:
                     return ErrorResponse(
-                        message="Input prompt is empty",
-                        type="invalid_request_error",
-                        code=400,
+                        error={
+                            "message": "Input prompt is empty",
+                            "type": "invalid_request_error",
+                            "code": 400,
+                        }
                     )
 
                 # It is much preferred to communicate block hashes to the router instead of
```
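
The sampling-parameter hunk above reflects two signature changes: the keyword default_max_tokens is now max_tokens, and default_sampling_params receives an empty dict rather than None. Below is a minimal, hedged sketch of the updated call; the ChatCompletionRequest import path and the request fields used to build it are assumptions, while the three keyword arguments mirror the diff:

```python
# Sketch under the assumptions stated above; not the example's full handler.
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

request = ChatCompletionRequest(
    model="placeholder-model",                        # hypothetical model name
    messages=[{"role": "user", "content": "hello"}],  # hypothetical message
    max_tokens=64,
)

sampling_params = request.to_sampling_params(
    max_tokens=request.max_tokens,   # was default_max_tokens=...
    logits_processor_pattern=None,
    default_sampling_params={},      # was default_sampling_params=None
)
print(sampling_params)
```
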
```diff
@@ -161,9 +168,11 @@ async def chat_completions(request: ChatCompletionRequest):
                 except (httpx.RequestError, httpx.HTTPStatusError) as e:
                     logger.error(f"Router request failed: {e}")
                     return ErrorResponse(
-                        message="Router service unavailable",
-                        type="service_unavailable",
-                        code=503,
+                        error={
+                            "message": "Router service unavailable",
+                            "type": "service_unavailable",
+                            "code": 503,
+                        }
                     )
 
                 logger.info(f"Selected worker {best_worker_id} for request")
@@ -172,9 +181,13 @@ async def chat_completions(request: ChatCompletionRequest):
                 request_id = f"chatcmpl-{uuid.uuid4()}"
                 request_metadata = RequestResponseMetadata(request_id=request_id)
 
+                # Convert engine_prompt dict to TokensPrompt object
+                tokens_prompt = TokensPrompt(prompt_token_ids=tokens)
+                logger.info(f"Created TokensPrompt with {len(tokens)} tokens")
+
                 # Get the generator from the selected worker with sampling params
                 result_generator = self.workers.direct(
-                    engine_prompt, best_worker_id, sampling_params
+                    tokens_prompt, best_worker_id, sampling_params
                 )
                 assert request.stream
 
```
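
The hunk above stops passing the raw engine_prompt dict to the worker and instead builds a TokensPrompt from the prompt token IDs before calling self.workers.direct(...). A minimal sketch of that conversion, with placeholder token IDs standing in for engine_prompt["prompt_token_ids"]:

```python
# Sketch only: the token IDs below are placeholders chosen for illustration.
from vllm.inputs.data import TokensPrompt

tokens = [101, 2023, 2003, 102]  # placeholder token IDs
tokens_prompt = TokensPrompt(prompt_token_ids=tokens)
print(tokens_prompt)
```
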

```diff
@@ -188,14 +201,17 @@ async def chat_completions(request: ChatCompletionRequest):
                         conversation,
                         self.tokenizer,
                         request_metadata,
+                        enable_force_include_usage=False,
                     ),
                     media_type="text/event-stream",
                     headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
                 )
 
             except Exception as e:
                 logger.error(f"Error processing request: {e}")
-                return ErrorResponse(message=str(e), type="internal_error", code=500)
+                return ErrorResponse(
+                    error={"message": str(e), "type": "internal_error", "code": 500}
+                )
 
     async def initialize_services(self):
         """Initialize workers, HTTP client, and OpenAI serving components"""
```

examples/deployments/router_standalone/router.py
Lines changed: 2 additions & 2 deletions

```diff
@@ -41,7 +41,7 @@ class RouterResponse(BaseModel):
 
 
 class LoadMetrics(BaseModel):
-    gpu_cache_usage: float
+    kv_cache_usage: float
     num_waiting_reqs: int
 
 
@@ -101,7 +101,7 @@ async def update_load(worker_id: int):
             try:
                 metrics_dict = self.load_listeners[worker_id].recv_json(zmq.NOBLOCK)
                 metrics = LoadMetrics.model_validate(metrics_dict)
-                self.kv_usages[worker_id] = metrics.gpu_cache_usage
+                self.kv_usages[worker_id] = metrics.kv_cache_usage
                 self.waitings[worker_id] = metrics.num_waiting_reqs
             except zmq.Again:
                 pass
```
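
Both hunks above are the consumer side of the gpu_cache_usage → kv_cache_usage rename. A minimal, self-contained sketch of the validation step, re-declaring an equivalent LoadMetrics model locally and feeding it placeholder values:

```python
# Sketch only: the model mirrors LoadMetrics above; the values are placeholders.
from pydantic import BaseModel


class LoadMetrics(BaseModel):
    kv_cache_usage: float
    num_waiting_reqs: int


metrics = LoadMetrics.model_validate({"kv_cache_usage": 0.42, "num_waiting_reqs": 3})
print(metrics.kv_cache_usage, metrics.num_waiting_reqs)
```

A payload still carrying the old gpu_cache_usage key would now fail validation (kv_cache_usage would be missing), which is why worker.py is renamed in the same commit.
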

examples/deployments/router_standalone/worker.py
Lines changed: 11 additions & 2 deletions

```diff
@@ -20,7 +20,13 @@
 from typing import AsyncGenerator, Optional
 
 import zmq
-from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
+from vllm.config import (
+    CacheConfig,
+    ModelConfig,
+    ObservabilityConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
 from vllm.distributed.kv_events import KVEventsConfig
 from vllm.inputs.data import TokensPrompt
 from vllm.outputs import RequestOutput
@@ -50,7 +56,7 @@ def record(
         # Send metrics over ZMQ
         metrics_data = {
             "num_waiting_reqs": scheduler_stats.num_waiting_reqs,
-            "gpu_cache_usage": scheduler_stats.gpu_cache_usage,
+            "kv_cache_usage": scheduler_stats.kv_cache_usage,
         }
 
         self.socket.send_json(metrics_data)
```
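
This is the producer side of the same rename: the record() hook now publishes kv_cache_usage so the router's LoadMetrics model validates the payload cleanly. A minimal sketch of that send path; the socket type, endpoint, and metric values are assumptions, only the payload keys come from the diff:

```python
# Sketch under the assumptions above: a PUB socket and a placeholder endpoint;
# worker.py's actual socket setup is not part of this diff.
import zmq

ctx = zmq.Context.instance()
socket = ctx.socket(zmq.PUB)
socket.bind("tcp://127.0.0.1:5555")  # placeholder endpoint

metrics_data = {
    "num_waiting_reqs": 3,    # placeholder value
    "kv_cache_usage": 0.42,   # placeholder value; key was "gpu_cache_usage"
}
socket.send_json(metrics_data)
```
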
```diff
@@ -108,11 +114,14 @@ def __init__(
             scheduler_cls="vllm.v1.core.sched.scheduler.Scheduler"
         )
 
+        observability_config = ObservabilityConfig()
+
         vllm_config = VllmConfig(
             model_config=model_config,
             cache_config=cache_config,
             kv_events_config=kv_events_config,
             scheduler_config=scheduler_config,
+            observability_config=observability_config,
         )
 
         self.llms.append(
```
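
The last hunk adds an explicit ObservabilityConfig to the worker's VllmConfig. Only the new sub-config is instantiated in the sketch below so that it stays runnable without a model; the remaining VllmConfig arguments are the ones already shown in the diff:

```python
# Sketch only: a default ObservabilityConfig, constructed exactly as in the
# diff above. In worker.py it is then passed as
# VllmConfig(..., observability_config=observability_config).
from vllm.config import ObservabilityConfig

observability_config = ObservabilityConfig()
print(observability_config)
```
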
