@@ -140,6 +140,52 @@ async def show_available_models(self) -> ModelList:
140140# Server entry points
141141
142142
def _register_profiling_routes(app) -> None:
    """Register /start_profile and /stop_profile directly on the app.

    These are registered on the app (not the module-level router) to
    guarantee availability regardless of how vllm's build_app() handles
    router inclusion.
    """

    async def _run_profile_action(
        raw_request: Request, method_name: str, verb: str
    ) -> JSONResponse:
        """Invoke ``engine_client.<method_name>()`` for the given request.

        Args:
            raw_request: Incoming request; the engine client is read from
                ``raw_request.app.state.engine_client``.
            method_name: Engine-client coroutine to call, e.g.
                ``"start_profile"`` or ``"stop_profile"``.
            verb: Human-readable action (``"start profile"`` /
                ``"stop profile"``) used in the failure log message.

        Returns:
            ``{"status": "ok"}`` on success.

        Raises:
            HTTPException: 500 with the original error text when the
                engine-client call fails; the cause is chained for tracing.
        """
        engine_client = raw_request.app.state.engine_client
        try:
            await getattr(engine_client, method_name)()
        except Exception as e:
            # Log with full traceback, then surface the failure to the
            # HTTP client instead of swallowing it.
            logger.exception("Failed to %s: %s", verb, e)
            raise HTTPException(
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
                detail=str(e),
            ) from e
        return JSONResponse(content={"status": "ok"})

    @app.post("/start_profile")
    async def start_profile(raw_request: Request) -> JSONResponse:
        """Start profiling on all stages.

        When the server is running under nsys with
        ``--capture-range=cudaProfilerApi``, this also opens the CUDA
        profiler capture region.
        """
        return await _run_profile_action(raw_request, "start_profile", "start profile")

    @app.post("/stop_profile")
    async def stop_profile(raw_request: Request) -> JSONResponse:
        """Stop profiling on all stages.

        When running under nsys, this closes the CUDA profiler capture
        region so nsys finalises the current capture.
        """
        return await _run_profile_action(raw_request, "stop_profile", "stop profile")
188+
143189async def omni_run_server (args , ** uvicorn_kwargs ) -> None :
144190 """Run a single-worker API server.
145191
@@ -183,6 +229,10 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None,
183229 ) as engine_client :
184230 app = build_app (args )
185231
232+ # Register profiling endpoints directly on the app so they are
233+ # available regardless of how vllm's build_app handles routers.
234+ _register_profiling_routes (app )
235+
186236 await omni_init_app_state (engine_client , app .state , args )
187237
188238 vllm_config = await engine_client .get_vllm_config ()
@@ -736,45 +786,6 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
736786 return StreamingResponse (content = generator , media_type = "text/event-stream" )
737787
738788
739- @router .post ("/start_profile" )
740- async def start_profile (raw_request : Request ) -> JSONResponse :
741- """Start profiling on all stages.
742-
743- When the server is running under nsys with
744- ``--capture-range=cudaProfilerApi``, this also opens the CUDA
745- profiler capture region.
746- """
747- engine_client = raw_request .app .state .engine_client
748- try :
749- await engine_client .start_profile ()
750- except Exception as e :
751- logger .exception ("Failed to start profile: %s" , e )
752- raise HTTPException (
753- status_code = HTTPStatus .INTERNAL_SERVER_ERROR .value ,
754- detail = str (e ),
755- ) from e
756- return JSONResponse (content = {"status" : "ok" })
757-
758-
759- @router .post ("/stop_profile" )
760- async def stop_profile (raw_request : Request ) -> JSONResponse :
761- """Stop profiling on all stages.
762-
763- When running under nsys, this closes the CUDA profiler capture
764- region so nsys finalises the current capture.
765- """
766- engine_client = raw_request .app .state .engine_client
767- try :
768- await engine_client .stop_profile ()
769- except Exception as e :
770- logger .exception ("Failed to stop profile: %s" , e )
771- raise HTTPException (
772- status_code = HTTPStatus .INTERNAL_SERVER_ERROR .value ,
773- detail = str (e ),
774- ) from e
775- return JSONResponse (content = {"status" : "ok" })
776-
777-
778789_remove_route_from_router (router , "/v1/audio/speech" , {"POST" })
779790
780791
0 commit comments