Commit 33e451f

ahengljh and claude committed:

[Profiler] Follow vLLM pattern for diffusion profiler integration

Use vLLM's CudaProfilerWrapper/TorchProfilerWrapper in DiffusionWorker instead of a custom implementation. This unifies the profiler approach between omni models and diffusion models.

- Import and use vLLM's profiler wrappers based on profiler_config
- VLLM_TORCH_CUDA_PROFILE=1 enables CudaProfilerWrapper for nsys
- VLLM_TORCH_PROFILER_DIR enables TorchProfilerWrapper for traces
- Remove the dependency on CurrentProfiler from the diffusion profiler module
- Update docs with vLLM-style nsys usage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: Jinheng Li <ahengljh@gmail.com>

1 parent 9c71ca7 commit 33e451f

File tree

2 files changed: +41 -26 lines changed

docs/contributing/profiling.md

Lines changed: 5 additions & 2 deletions

````diff
@@ -135,11 +135,14 @@ python image_to_video.py \
 
 ### 4. Nsight Systems Profiling (Diffusion)
 
-For deeper GPU-level analysis of diffusion workloads, use NVIDIA Nsight Systems (`nsys`). The diffusion worker integrates with nsys via `torch.cuda.profiler.start()/stop()` when profiling is triggered.
+For deeper GPU-level analysis of diffusion workloads, use NVIDIA Nsight Systems (`nsys`). Diffusion workers follow the same profiler pattern as vLLM: set `VLLM_TORCH_CUDA_PROFILE=1` to enable the CUDA profiler, which signals nsys via `torch.cuda.profiler.start()/stop()`.
 
 **Usage:**
 
 ```bash
+# Enable CUDA profiler for nsys integration
+export VLLM_TORCH_CUDA_PROFILE=1
+
 nsys profile \
     --capture-range=cudaProfilerApi \
     --capture-range-end=repeat \
@@ -149,7 +152,7 @@ nsys profile \
 python image_to_video.py --model Wan-AI/Wan2.2-I2V-A14B-Diffusers ...
 ```
 
-Set `VLLM_TORCH_PROFILER_DIR` to trigger profiling, which also opens nsys capture regions in diffusion worker processes.
+The `VLLM_TORCH_CUDA_PROFILE=1` environment variable configures diffusion workers to use vLLM's `CudaProfilerWrapper`, which brackets GPU work with `torch.cuda.profiler.start()/stop()` calls that nsys captures.
 
 ```bash
 ls diffusion_trace*.nsys-rep
````

vllm_omni/diffusion/worker/diffusion_worker.py

Lines changed: 36 additions & 24 deletions

````diff
@@ -13,9 +13,12 @@
 
 import torch
 import zmq
+from typing import Any
+
 from vllm.config import VllmConfig
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.logger import init_logger
+from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
 from vllm.utils.mem_utils import GiB_bytes
 
 from vllm_omni.diffusion.data import (
@@ -29,7 +32,6 @@
 )
 from vllm_omni.diffusion.forward_context import set_forward_context
 from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager
-from vllm_omni.diffusion.profiler import CurrentProfiler
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner
 from vllm_omni.lora.request import LoRARequest
@@ -65,6 +67,7 @@ def __init__(
         self.model_runner: DiffusionModelRunner | None = None
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
         self.lora_manager: DiffusionLoRAManager | None = None
+        self.profiler: Any | None = None
         self.init_device()
 
     def init_device(self) -> None:
@@ -89,6 +92,21 @@ def init_device(self) -> None:
         vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size
         self.vllm_config = vllm_config
 
+        # Initialize profiler based on profiler_config (follows vLLM pattern)
+        profiler_config = vllm_config.profiler_config
+        if profiler_config.profiler == "torch":
+            worker_name = f"diffusion-rank-{self.rank}"
+            self.profiler = TorchProfilerWrapper(
+                profiler_config,
+                worker_name=worker_name,
+                local_rank=self.local_rank,
+                activities=["CPU", "CUDA"],
+            )
+        elif profiler_config.profiler == "cuda":
+            self.profiler = CudaProfilerWrapper(profiler_config)
+        else:
+            self.profiler = None
+
         # Initialize distributed environment
         with set_forward_context(vllm_config=vllm_config, omni_diffusion_config=self.od_config):
             init_distributed_environment(world_size=world_size, rank=rank)
@@ -129,33 +147,27 @@ def generate(self, request: OmniDiffusionRequest) -> DiffusionOutput:
         """Generate output for the given requests."""
         return self.execute_model(request, self.od_config)
 
-    @classmethod
-    def start_profile(cls, trace_path_template: str) -> str:
+    def start_profile(self, trace_path_template: str = "") -> str:
         """Start profiling for this GPU worker.
 
-        Also opens a CUDA profiler capture region so that nsys (when
-        launched with ``--capture-range=cudaProfilerApi``) records GPU
-        activity from within this worker process.
-        """
-        if torch.cuda.is_available():
-            try:
-                torch.cuda.profiler.start()
-            except Exception as e:
-                logger.warning("Failed to start CUDA profiler in DiffusionWorker: %s", e)
-        return CurrentProfiler.start(trace_path_template)
-
-    @classmethod
-    def stop_profile(cls) -> dict | None:
-        """Stop profiling and return the result dictionary.
+        Uses vLLM's profiler wrappers based on profiler_config:
+        - 'torch': TorchProfilerWrapper for detailed CPU/CUDA traces
+        - 'cuda': CudaProfilerWrapper for nsys integration
 
-        Also closes the CUDA profiler capture region for nsys.
+        Set VLLM_TORCH_CUDA_PROFILE=1 for nsys/cuda profiler, or
+        VLLM_TORCH_PROFILER_DIR for torch profiler.
         """
-        if torch.cuda.is_available():
-            try:
-                torch.cuda.profiler.stop()
-            except Exception as e:
-                logger.warning("Failed to stop CUDA profiler in DiffusionWorker: %s", e)
-        return CurrentProfiler.stop()
+        if self.profiler is not None:
+            self.profiler.start()
+            logger.info("Diffusion worker %s: profiler started", self.rank)
+        return trace_path_template
+
+    def stop_profile(self) -> dict | None:
+        """Stop profiling and return the result dictionary."""
+        if self.profiler is not None:
+            self.profiler.stop()
+            logger.info("Diffusion worker %s: profiler stopped", self.rank)
+        return None
 
     def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput:
         """Execute a forward pass by delegating to the model runner."""
````
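The reworked lifecycle can be exercised end to end with a stand-in profiler. This is a hedged sketch: `FakeProfiler` and `MiniWorker` are hypothetical classes that mirror only the `start_profile`/`stop_profile` shape the commit gives `DiffusionWorker`, without CUDA or vLLM config.

```python
class FakeProfiler:
    """Hypothetical stand-in for CudaProfilerWrapper/TorchProfilerWrapper;
    it records start/stop calls instead of driving a real profiler."""
    def __init__(self) -> None:
        self.events: list[str] = []

    def start(self) -> None:
        self.events.append("start")

    def stop(self) -> None:
        self.events.append("stop")


class MiniWorker:
    """Minimal sketch of the profiling surface DiffusionWorker now exposes."""
    def __init__(self, profiler) -> None:
        # In the real worker, self.profiler is set during init_device()
        # from vllm_config.profiler_config; None disables profiling.
        self.profiler = profiler

    def start_profile(self, trace_path_template: str = "") -> str:
        if self.profiler is not None:
            self.profiler.start()
        return trace_path_template

    def stop_profile(self) -> None:
        if self.profiler is not None:
            self.profiler.stop()
        return None


worker = MiniWorker(FakeProfiler())
worker.start_profile()
# ... generation steps would run between start and stop ...
worker.stop_profile()
print(worker.profiler.events)  # ['start', 'stop']

# A worker with no profiler configured degrades to a safe no-op.
print(MiniWorker(None).start_profile("trace-%d"))  # trace-%d
```

Note the design shift the diff makes: `start_profile`/`stop_profile` become instance methods rather than classmethods, so each worker rank owns its wrapper instead of sharing process-global profiler state.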
