Skip to content

Commit 69ab0b1

Browse files
[Fix] Add PD routing error handling (#585)
* [CI] Add prefix aware routing test Signed-off-by: Rui Zhang <[email protected]> * [ci] refactor k8s discovery e2e test Signed-off-by: Rui Zhang <[email protected]> * [CI] Refactor static discovery testing so that it can support multiple logic Signed-off-by: Rui Zhang <[email protected]> * [CI] Add static e2e test for prefixaware Signed-off-by: Rui Zhang <[email protected]> * refactor the code Signed-off-by: Rui Zhang <[email protected]> * [CI] refactor Signed-off-by: Rui Zhang <[email protected]> * [CI] Add multiple routing logic test Signed-off-by: Rui Zhang <[email protected]> * [CI] fix bug Signed-off-by: Rui Zhang <[email protected]> * hotfix/add error handle in pd routing Signed-off-by: Rui Zhang <[email protected]> * modify Signed-off-by: Rui Zhang <[email protected]> --------- Signed-off-by: Rui Zhang <[email protected]>
1 parent 5cb150e commit 69ab0b1

File tree

1 file changed

+67
-13
lines changed
  • src/vllm_router/services/request_service

1 file changed

+67
-13
lines changed

src/vllm_router/services/request_service/request.py

Lines changed: 67 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -357,21 +357,75 @@ async def route_disaggregated_prefill_request(
357357
orig_max_tokens = request_json.get("max_tokens", 0)
358358
request_json["max_tokens"] = 1
359359
st = time.time()
360-
await send_request_to_prefiller(
361-
request.app.state.prefill_client, endpoint, request_json, request_id
362-
)
363-
et = time.time()
364-
logger.info(f"{request_id} prefill time (TTFT): {et - st:.4f}")
365-
logger.info(
366-
f"Routing request {request_id} with session id None to {request.app.state.prefill_client.base_url} at {et}, process time = {et - in_router_time:.4f}"
367-
)
368-
request_json["max_tokens"] = orig_max_tokens
360+
try:
361+
await send_request_to_prefiller(
362+
request.app.state.prefill_client, endpoint, request_json, request_id
363+
)
364+
et = time.time()
365+
logger.info(f"{request_id} prefill time (TTFT): {et - st:.4f}")
366+
logger.info(
367+
f"Routing request {request_id} with session id None to {request.app.state.prefill_client.base_url} at {et}, process time = {et - in_router_time:.4f}"
368+
)
369+
request_json["max_tokens"] = orig_max_tokens
370+
except httpx.HTTPStatusError as e:
371+
logger.error(f"HTTP error in prefiller: {e}", exc_info=True)
372+
return JSONResponse(
373+
status_code=e.response.status_code,
374+
content={
375+
"error": {
376+
"message": f"Prefiller error: {e.response.text}",
377+
"type": "prefiller_error",
378+
"code": e.response.status_code,
379+
}
380+
},
381+
headers={"X-Request-Id": request_id},
382+
)
383+
except Exception as e:
384+
logger.error(f"Unexpected error in prefiller: {e}", exc_info=True)
385+
return JSONResponse(
386+
status_code=500,
387+
content={
388+
"error": {
389+
"message": f"Prefiller error: {str(e)}",
390+
"type": "prefiller_error",
391+
"code": 500,
392+
}
393+
},
394+
headers={"X-Request-Id": request_id},
395+
)
369396

370397
async def generate_stream():
371-
async for chunk in send_request_to_decode(
372-
request.app.state.decode_client, endpoint, request_json, request_id
373-
):
374-
yield chunk
398+
try:
399+
async for chunk in send_request_to_decode(
400+
request.app.state.decode_client, endpoint, request_json, request_id
401+
):
402+
yield chunk
403+
except httpx.HTTPStatusError as e:
404+
logger.error(f"HTTP error in decoder: {e}", exc_info=True)
405+
try:
406+
error_text = e.response.text
407+
except Exception:
408+
error_text = f"HTTP {e.response.status_code}"
409+
# Yield error as JSON response
410+
error_response = {
411+
"error": {
412+
"message": f"Decoder error: {error_text}",
413+
"type": "decoder_error",
414+
"code": e.response.status_code,
415+
}
416+
}
417+
yield json.dumps(error_response).encode("utf-8")
418+
except Exception as e:
419+
logger.error(f"Unexpected error in decoder: {e}", exc_info=True)
420+
# Yield error as JSON response
421+
error_response = {
422+
"error": {
423+
"message": f"Decoder error: {str(e)}",
424+
"type": "decoder_error",
425+
"code": 500,
426+
}
427+
}
428+
yield json.dumps(error_response).encode("utf-8")
375429

376430
curr_time = time.time()
377431
logger.info(

0 commit comments

Comments
 (0)