Commit b23aa54

ahengljh and claude committed
[Profiler] Add Nsight Systems support for online serving
Add CudaProfiler class and HTTP /start_profile, /stop_profile endpoints so that nsys can capture GPU-level traces during online serving via the cudaProfilerApi capture range. Both sync and async stage workers now call torch.cuda.profiler.start()/stop() alongside the existing torch profiler.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 4ba0cf0 commit b23aa54

File tree

5 files changed: +178 −3 lines


docs/contributing/profiling.md

Lines changed: 48 additions & 2 deletions
@@ -132,9 +132,55 @@ python image_to_video.py \
  2. **Wan-AI/Wan2.2-I2V-A14B-Diffusers**: [https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video)

  > **Note:**
- As of now, asynchronous (online) profiling is not fully supported in vLLM-Omni. While start_profile() and stop_profile() methods exist, they are only reliable in offline inference scripts (e.g., the provided end2end.py examples). Do not use them in server-mode or streaming scenarios—traces may be incomplete or fail to flush.
+ The PyTorch Profiler (`start_profile()` / `stop_profile()`) is primarily designed for offline inference scripts. For online (server-mode) profiling, use Nsight Systems as described below.

- ### 4. Analyzing Omni Traces
+ ### 4. Nsight Systems Profiling for Online Serving
+
+ NVIDIA Nsight Systems (`nsys`) can capture GPU-level traces while the server is running. The API server exposes `/start_profile` and `/stop_profile` HTTP endpoints that signal nsys via `torch.cuda.profiler.start()` / `stop()`.
+
+ **Step 1 — Launch the server under nsys:**
+
+ ```bash
+ nsys profile \
+     --capture-range=cudaProfilerApi \
+     --capture-range-end=repeat \
+     --trace-fork-before-exec=true \
+     --cuda-graph-trace=node \
+     vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091
+ ```
+
+ `--capture-range=cudaProfilerApi` tells nsys to sit idle until `torch.cuda.profiler.start()` is called in a worker process. `--capture-range-end=repeat` allows multiple start/stop cycles in the same session.
+
+ **Step 2 — Start profiling:**
+
+ ```bash
+ curl -X POST http://localhost:8091/start_profile
+ ```
+
+ **Step 3 — Send requests:**
+
+ ```bash
+ curl -X POST http://localhost:8091/v1/chat/completions \
+     -H "Content-Type: application/json" \
+     -d '{"model":"Qwen/Qwen2.5-Omni-7B","messages":[{"role":"user","content":"Hello"}]}'
+ ```
+
+ **Step 4 — Stop profiling:**
+
+ ```bash
+ curl -X POST http://localhost:8091/stop_profile
+ ```
+
+ **Step 5 — Shut down the server** (Ctrl+C). nsys writes a `.nsys-rep` file in the current directory.
+
+ ```bash
+ ls *.nsys-rep
+ nsys stats report1.nsys-rep
+ ```
+
+ Open the `.nsys-rep` file in the Nsight Systems GUI for a detailed timeline of CUDA kernels, memory operations, and NVTX ranges.
+
+ ### 5. Analyzing Omni Traces

  Output files are saved to your configured `VLLM_TORCH_PROFILER_DIR`.

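Besides the GUI, `nsys stats` can emit machine-readable summaries (for example with `--format csv`), which makes it easy to post-process kernel timings in Python. A minimal sketch with the standard-library `csv` module; the sample text, report name, and column headers below are illustrative assumptions, since the actual columns vary by nsys version and report type:

```python
import csv
import io

# Illustrative sample of a `nsys stats --format csv` kernel-summary report.
# Column names are assumptions; check your nsys version's actual output.
sample = """Time (%),Total Time (ns),Instances,Name
42.5,1250000,100,flash_attn_kernel
30.1,885000,100,gemm_kernel
"""

def top_kernels(csv_text: str, n: int = 5) -> list[tuple[str, int]]:
    """Return the n kernels with the largest total GPU time."""
    rows = list(csv.DictReader(io.StringIO(csv_text)))
    # Sort by total time, descending, and keep (name, total_ns) pairs.
    rows.sort(key=lambda r: int(r["Total Time (ns)"]), reverse=True)
    return [(r["Name"], int(r["Total Time (ns)"])) for r in rows[:n]]

print(top_kernels(sample))
```

This kind of script is handy for diffing kernel hotspots between two capture sessions without opening the GUI.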
Lines changed: 2 additions & 1 deletion
@@ -1,9 +1,10 @@
  # SPDX-License-Identifier: Apache-2.0
  # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+ from .cuda_profiler import CudaProfiler
  from .torch_profiler import TorchProfiler

  # Default profiler – can be changed later via config
  CurrentProfiler = TorchProfiler

- __all__ = ["CurrentProfiler", "TorchProfiler"]
+ __all__ = ["CudaProfiler", "CurrentProfiler", "TorchProfiler"]
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+ from contextlib import nullcontext
+
+ import torch
+ from vllm.logger import init_logger
+
+ from .base import ProfilerBase
+
+ logger = init_logger(__name__)
+
+
+ class CudaProfiler(ProfilerBase):
+     """
+     Lightweight profiler that signals nsys via the CUDA Profiler API.
+
+     When the server is launched under ``nsys profile
+     --capture-range=cudaProfilerApi``, calling ``start()`` /
+     ``stop()`` brackets the region that nsys will capture. No trace
+     files are written by this class — nsys handles all tracing
+     externally and produces a ``.nsys-rep`` file on process exit.
+     """
+
+     _active: bool = False
+
+     @classmethod
+     def start(cls, trace_path_template: str = "") -> str:
+         """Start the CUDA profiler range for nsys capture."""
+         if cls._active:
+             logger.warning("[Rank %s] CUDA profiler already active", cls._get_rank())
+             return ""
+         torch.cuda.profiler.start()
+         cls._active = True
+         logger.info("[Rank %s] CUDA profiler started (nsys capture region open)", cls._get_rank())
+         return ""
+
+     @classmethod
+     def stop(cls) -> str | None:
+         """Stop the CUDA profiler range for nsys capture."""
+         if not cls._active:
+             return None
+         torch.cuda.profiler.stop()
+         cls._active = False
+         logger.info("[Rank %s] CUDA profiler stopped (nsys capture region closed)", cls._get_rank())
+         return None
+
+     @classmethod
+     def get_step_context(cls):
+         """Return an NVTX range context manager when active, else no-op."""
+         if cls._active:
+             return torch.cuda.nvtx.range("step")
+         return nullcontext()
+
+     @classmethod
+     def is_active(cls) -> bool:
+         return cls._active
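The `_active` guard above makes `start()`/`stop()` idempotent: a second `start()` or an unmatched `stop()` never reaches the driver, so nsys sees exactly one open/close per cycle. That invariant can be exercised without a GPU by stubbing the driver calls; a minimal, self-contained sketch where `FakeCudaProfiler` and `GuardedProfiler` are illustrative names, not part of vLLM-Omni:

```python
class FakeCudaProfiler:
    """Stand-in for torch.cuda.profiler that records every driver call."""
    calls: list[str] = []

    @classmethod
    def start(cls) -> None:
        cls.calls.append("start")

    @classmethod
    def stop(cls) -> None:
        cls.calls.append("stop")


class GuardedProfiler:
    """Same idempotence pattern as CudaProfiler, with the driver stubbed."""
    _active = False

    @classmethod
    def start(cls) -> None:
        if cls._active:          # second start is a no-op
            return
        FakeCudaProfiler.start()
        cls._active = True

    @classmethod
    def stop(cls) -> None:
        if not cls._active:      # stop without a matching start is a no-op
            return
        FakeCudaProfiler.stop()
        cls._active = False


GuardedProfiler.start()
GuardedProfiler.start()  # ignored: capture region already open
GuardedProfiler.stop()
GuardedProfiler.stop()   # ignored: capture region already closed
print(FakeCudaProfiler.calls)  # exactly one start/stop pair reached the "driver"
```

The same double-call sequence against a raw `torch.cuda.profiler` would emit redundant range markers, which is why the class-level flag exists.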

vllm_omni/entrypoints/omni_stage.py

Lines changed: 32 additions & 0 deletions
@@ -732,7 +732,16 @@ def _stage_worker(
  def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
      """Handle profiler task locally in the worker process."""
+     import torch
+
      if task_type == OmniStageTaskType.PROFILER_START:
+         # Signal nsys to begin capturing (no-op if not under nsys)
+         try:
+             torch.cuda.profiler.start()
+             logger.info("[Stage-%s] CUDA profiler started (nsys capture region open)", stage_id)
+         except Exception as e:
+             logger.warning("[Stage-%s] Failed to start CUDA profiler: %s", stage_id, e)
+
          if stage_type == "diffusion":
              try:
                  profile_dir = _os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")

@@ -751,6 +760,13 @@ def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
              return {}

      elif task_type == OmniStageTaskType.PROFILER_STOP:
+         # Signal nsys to stop capturing (no-op if not under nsys)
+         try:
+             torch.cuda.profiler.stop()
+             logger.info("[Stage-%s] CUDA profiler stopped (nsys capture region closed)", stage_id)
+         except Exception as e:
+             logger.warning("[Stage-%s] Failed to stop CUDA profiler: %s", stage_id, e)
+
          if stage_type == "diffusion":
              try:
                  # CRITICAL: Capture return value

@@ -1285,7 +1301,16 @@ async def _force_log():
  async def handle_profiler_task_async(task_type: OmniStageTaskType) -> None:
      """Handle profiler task asynchronously for both LLM and diffusion stages."""
+     import torch
+
      if task_type == OmniStageTaskType.PROFILER_START:
+         # Signal nsys to begin capturing (no-op if not under nsys)
+         try:
+             torch.cuda.profiler.start()
+             logger.info("[Stage-%s] CUDA profiler started (nsys capture region open)", stage_id)
+         except Exception as e:
+             logger.warning("[Stage-%s] Failed to start CUDA profiler: %s", stage_id, e)
+
          if stage_type == "diffusion":
              try:
                  # Sync call is safe here — diffusion profiling is lightweight

@@ -1304,6 +1329,13 @@ async def handle_profiler_task_async(task_type: OmniStageTaskType) -> None:
                  logger.warning("[Stage-%s] Failed to start vLLM profiler: %s", stage_id, e)

      elif task_type == OmniStageTaskType.PROFILER_STOP:
+         # Signal nsys to stop capturing (no-op if not under nsys)
+         try:
+             torch.cuda.profiler.stop()
+             logger.info("[Stage-%s] CUDA profiler stopped (nsys capture region closed)", stage_id)
+         except Exception as e:
+             logger.warning("[Stage-%s] Failed to stop CUDA profiler: %s", stage_id, e)
+
          if stage_type == "diffusion":
              try:
                  trace_files = stage_engine.stop_profile()

vllm_omni/entrypoints/openai/api_server.py

Lines changed: 39 additions & 0 deletions
@@ -736,6 +736,45 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
          return StreamingResponse(content=generator, media_type="text/event-stream")


+ @router.post("/start_profile")
+ async def start_profile(raw_request: Request) -> JSONResponse:
+     """Start profiling on all stages.
+
+     When the server is running under nsys with
+     ``--capture-range=cudaProfilerApi``, this also opens the CUDA
+     profiler capture region.
+     """
+     engine_client = raw_request.app.state.engine_client
+     try:
+         await engine_client.start_profile()
+     except Exception as e:
+         logger.exception("Failed to start profile: %s", e)
+         raise HTTPException(
+             status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+             detail=str(e),
+         ) from e
+     return JSONResponse(content={"status": "ok"})
+
+
+ @router.post("/stop_profile")
+ async def stop_profile(raw_request: Request) -> JSONResponse:
+     """Stop profiling on all stages.
+
+     When running under nsys, this closes the CUDA profiler capture
+     region so nsys finalises the current capture.
+     """
+     engine_client = raw_request.app.state.engine_client
+     try:
+         await engine_client.stop_profile()
+     except Exception as e:
+         logger.exception("Failed to stop profile: %s", e)
+         raise HTTPException(
+             status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value,
+             detail=str(e),
+         ) from e
+     return JSONResponse(content={"status": "ok"})
+
+
  _remove_route_from_router(router, "/v1/audio/speech", {"POST"})