Skip to content

Commit 84ebfcb

Browse files
ahengljh and claude committed
[Profiler] Fix 404 by registering profile routes on app directly
Move /start_profile and /stop_profile from the module-level router to direct app registration via _register_profiling_routes(), which is called after build_app() returns. This ensures the routes exist on the app regardless of how vllm's build_app() handles router inclusion.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
1 parent ce8a8a5 commit 84ebfcb

File tree

1 file changed

+50
-39
lines changed

1 file changed

+50
-39
lines changed

vllm_omni/entrypoints/openai/api_server.py

Lines changed: 50 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,52 @@ async def show_available_models(self) -> ModelList:
140140
# Server entry points
141141

142142

143+
def _register_profiling_routes(app) -> None:
    """Attach the profiling control endpoints to ``app`` itself.

    Adds ``POST /start_profile`` and ``POST /stop_profile`` through
    ``app.post`` instead of the module-level router, so the endpoints do not
    depend on how vllm's ``build_app()`` handles router inclusion.

    Args:
        app: Application object exposing a ``post(path)`` route decorator.
            Each handler looks up ``engine_client`` on
            ``raw_request.app.state`` when a request arrives.
    """

    async def start_profile(raw_request: Request) -> JSONResponse:
        """Start profiling on all stages.

        When the server is running under nsys with
        ``--capture-range=cudaProfilerApi``, this also opens the CUDA
        profiler capture region.
        """
        # Looked up outside the try block on purpose: a missing attribute
        # propagates as-is instead of being wrapped in an HTTPException.
        client = raw_request.app.state.engine_client
        try:
            await client.start_profile()
        except Exception as exc:
            logger.exception("Failed to start profile: %s", exc)
            raise HTTPException(
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
                detail=str(exc),
            ) from exc
        else:
            return JSONResponse(content={"status": "ok"})

    async def stop_profile(raw_request: Request) -> JSONResponse:
        """Stop profiling on all stages.

        When running under nsys, this closes the CUDA profiler capture
        region so nsys finalises the current capture.
        """
        client = raw_request.app.state.engine_client
        try:
            await client.stop_profile()
        except Exception as exc:
            logger.exception("Failed to stop profile: %s", exc)
            raise HTTPException(
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
                detail=str(exc),
            ) from exc
        else:
            return JSONResponse(content={"status": "ok"})

    # Register in the same order as before: start first, then stop.
    app.post("/start_profile")(start_profile)
    app.post("/stop_profile")(stop_profile)
187+
188+
143189
async def omni_run_server(args, **uvicorn_kwargs) -> None:
144190
"""Run a single-worker API server.
145191
@@ -183,6 +229,10 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None,
183229
) as engine_client:
184230
app = build_app(args)
185231

232+
# Register profiling endpoints directly on the app so they are
233+
# available regardless of how vllm's build_app handles routers.
234+
_register_profiling_routes(app)
235+
186236
await omni_init_app_state(engine_client, app.state, args)
187237

188238
vllm_config = await engine_client.get_vllm_config()
@@ -736,45 +786,6 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
736786
return StreamingResponse(content=generator, media_type="text/event-stream")
737787

738788

739-
@router.post("/start_profile")
async def start_profile(raw_request: Request) -> JSONResponse:
    """Start profiling on all stages.

    When the server is running under nsys with
    ``--capture-range=cudaProfilerApi``, this also opens the CUDA
    profiler capture region.
    """
    # Resolve the client before entering the try block so lookup errors are
    # not converted into an HTTPException.
    client = raw_request.app.state.engine_client
    try:
        await client.start_profile()
    except Exception as exc:
        # Log with traceback, then surface the failure as an HTTP 500.
        logger.exception("Failed to start profile: %s", exc)
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
            detail=str(exc),
        ) from exc
    else:
        return JSONResponse(content={"status": "ok"})
757-
758-
759-
@router.post("/stop_profile")
async def stop_profile(raw_request: Request) -> JSONResponse:
    """Stop profiling on all stages.

    When running under nsys, this closes the CUDA profiler capture
    region so nsys finalises the current capture.
    """
    # Resolve the client before entering the try block so lookup errors are
    # not converted into an HTTPException.
    client = raw_request.app.state.engine_client
    try:
        await client.stop_profile()
    except Exception as exc:
        # Log with traceback, then surface the failure as an HTTP 500.
        logger.exception("Failed to stop profile: %s", exc)
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
            detail=str(exc),
        ) from exc
    else:
        return JSONResponse(content={"status": "ok"})
776-
777-
778789
# NOTE(review): judging by the helper's name, this drops the POST handler for
# /v1/audio/speech from the module-level router; confirm against the helper's
# definition, which is not shown here.
_remove_route_from_router(router, "/v1/audio/speech", {"POST"})
779790

780791

0 commit comments

Comments
 (0)