Skip to content

Commit 7ce50b0

Browse files
committed
feat(router): make timeout customizable
A 30-second timeout per model is enough for most users, but some deployments use very large models or slow machines, so the timeout should be configurable to support those cases as well. Customizing the health check interval will be done in a separate MR. Signed-off-by: Max Wittig <[email protected]>
1 parent 32319c3 commit 7ce50b0

File tree

5 files changed

+19
-4
lines changed

5 files changed

+19
-4
lines changed

src/tests/test_static_service_discovery.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def test_init_when_static_backend_health_checks_calls_start_health_checks(
2121
None,
2222
None,
2323
static_backend_health_checks=True,
24+
static_backend_health_checks_timeout=30,
2425
prefill_model_labels=None,
2526
decode_model_labels=None,
2627
)
@@ -43,6 +44,7 @@ def test_init_when_endpoint_health_check_disabled_does_not_call_start_health_che
4344
None,
4445
None,
4546
static_backend_health_checks=False,
47+
static_backend_health_checks_timeout=30,
4648
prefill_model_labels=None,
4749
decode_model_labels=None,
4850
)
@@ -61,6 +63,7 @@ def test_get_unhealthy_endpoint_hashes_when_only_healthy_models_exist_does_not_r
6163
None,
6264
["chat"],
6365
static_backend_health_checks=True,
66+
static_backend_health_checks_timeout=30,
6467
prefill_model_labels=None,
6568
decode_model_labels=None,
6669
)
@@ -79,6 +82,7 @@ def test_get_unhealthy_endpoint_hashes_when_unhealthy_model_exist_returns_unheal
7982
None,
8083
["chat"],
8184
static_backend_health_checks=False,
85+
static_backend_health_checks_timeout=30,
8286
prefill_model_labels=None,
8387
decode_model_labels=None,
8488
)
@@ -92,7 +96,7 @@ def test_get_unhealthy_endpoint_hashes_when_healthy_and_unhealthy_models_exist_r
9296
) -> None:
9397
unhealthy_model = "bge-m3"
9498

95-
def mock_is_model_healthy(url: str, model: str, model_type: str) -> bool:
99+
def mock_is_model_healthy(url: str, model: str, model_type: str, timeout: int = 30) -> bool:
96100
return model != unhealthy_model
97101

98102
monkeypatch.setattr("vllm_router.utils.is_model_healthy", mock_is_model_healthy)
@@ -104,6 +108,7 @@ def mock_is_model_healthy(url: str, model: str, model_type: str) -> bool:
104108
None,
105109
["chat", "embeddings"],
106110
static_backend_health_checks=False,
111+
static_backend_health_checks_timeout=30,
107112
prefill_model_labels=None,
108113
decode_model_labels=None,
109114
)
@@ -128,6 +133,7 @@ def mock_get_model_endpoint_hash(url: str, model: str) -> str:
128133
None,
129134
["chat", "chat"],
130135
static_backend_health_checks=False,
136+
static_backend_health_checks_timeout=30,
131137
prefill_model_labels=None,
132138
decode_model_labels=None,
133139
)

src/vllm_router/app.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ def initialize_all(app: FastAPI, args):
149149
else None
150150
),
151151
static_backend_health_checks=args.static_backend_health_checks,
152+
static_backend_health_checks_timeout=args.static_backend_health_checks_timeout,
152153
prefill_model_labels=args.prefill_model_labels,
153154
decode_model_labels=args.decode_model_labels,
154155
)

src/vllm_router/parsers/parser.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,12 @@ def parse_args():
156156
action="store_true",
157157
help="Enable this flag to make vllm-router check periodically if the models work by sending dummy requests to their endpoints.",
158158
)
159+
parser.add_argument(
160+
"--static-backend-health-checks-timeout",
161+
type=int,
162+
help="Timeout in seconds for dummy requests sent using the static backend health check. Defaults to 30.",
163+
default=30,
164+
)
159165
parser.add_argument(
160166
"--k8s-port",
161167
type=int,

src/vllm_router/service_discovery.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ def __init__(
210210
model_labels: List[str] | None,
211211
model_types: List[str] | None,
212212
static_backend_health_checks: bool,
213+
static_backend_health_checks_timeout: int,
213214
prefill_model_labels: List[str] | None,
214215
decode_model_labels: List[str] | None,
215216
):
@@ -223,6 +224,7 @@ def __init__(
223224
self.engines_id = [str(uuid.uuid4()) for i in range(0, len(urls))]
224225
self.added_timestamp = int(time.time())
225226
self.unhealthy_endpoint_hashes = []
227+
self.static_backend_health_checks_timeout = static_backend_health_checks_timeout
226228
if static_backend_health_checks:
227229
self.start_health_check_task()
228230
self.prefill_model_labels = prefill_model_labels
@@ -231,7 +233,7 @@ def __init__(
231233
def get_unhealthy_endpoint_hashes(self) -> list[str]:
232234
unhealthy_endpoints = []
233235
for url, model, model_type in zip(self.urls, self.models, self.model_types):
234-
if utils.is_model_healthy(url, model, model_type):
236+
if utils.is_model_healthy(url, model, model_type, self.static_backend_health_checks_timeout):
235237
logger.debug(f"{model} at {url} is healthy")
236238
else:
237239
logger.warning(f"{model} at {url} not healthy!")

src/vllm_router/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,14 @@ def update_content_length(request: Request, request_body: str):
157157
request._headers = headers
158158

159159

160-
def is_model_healthy(url: str, model: str, model_type: str) -> bool:
160+
def is_model_healthy(url: str, model: str, model_type: str, timeout: int = 30) -> bool:
161161
model_details = ModelType[model_type]
162162
try:
163163
response = requests.post(
164164
f"{url}{model_details.value}",
165165
headers={"Content-Type": "application/json"},
166166
json={"model": model} | model_details.get_test_payload(model_type),
167-
timeout=30,
167+
timeout=timeout,
168168
)
169169
except Exception as e:
170170
logger.error(e)

0 commit comments

Comments
 (0)