Skip to content

Commit b0c7853

Browse files
committed
Align diffusion profiling with vLLM
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
1 parent 33e451f commit b0c7853

File tree

6 files changed

+98
-99
lines changed

6 files changed

+98
-99
lines changed

docs/contributing/profiling.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
> **Warning:** Profiling incurs significant overhead. Use only for development and debugging, never in production.
44
55
vLLM-Omni supports two profiling approaches:
6-
- **PyTorch Profiler** — detailed CPU/CUDA traces (`.json.gz` files viewable in Perfetto)
6+
- **PyTorch Profiler** — detailed CPU/CUDA traces (`*.pt.trace.json` files viewable in Perfetto)
77
- **Nsight Systems (nsys)** — GPU-level tracing with CUDA kernel timelines (`.nsys-rep` files)
88

99
### 1. Set the Output Directory (PyTorch Profiler)
@@ -20,6 +20,10 @@ It is best to limit profiling to one iteration to keep trace files manageable.
2020
```bash
2121
export VLLM_PROFILER_MAX_ITERS=1
2222
```
23+
Optionally, skip initial warmup iterations before collecting traces:
24+
```bash
25+
export VLLM_PROFILER_DELAY_ITERS=1
26+
```
2327

2428
**Selective Stage Profiling**
2529
The profiler defaults to running across all stages. However, it is highly recommended to profile specific stages by passing a list of stage IDs, to avoid producing excessively large trace files:
@@ -142,12 +146,19 @@ For deeper GPU-level analysis of diffusion workloads, use NVIDIA Nsight Systems
142146
```bash
143147
# Enable CUDA profiler for nsys integration
144148
export VLLM_TORCH_CUDA_PROFILE=1
149+
# Capture a fixed range of iterations (skip warmup, then capture N iters)
150+
export VLLM_PROFILER_DELAY_ITERS=10
151+
export VLLM_PROFILER_MAX_ITERS=10
152+
# Optional: enable NVTX ranges (used by vLLM tracing)
153+
export VLLM_PROFILER_TRACE_DIR=./vllm_trace
145154

146155
nsys profile \
147156
--capture-range=cudaProfilerApi \
148-
--capture-range-end=repeat \
157+
--capture-range-end=stop \
149158
--trace-fork-before-exec=true \
150159
--cuda-graph-trace=node \
160+
--sample=none \
161+
--stats=true \
151162
-o diffusion_trace \
152163
python image_to_video.py --model Wan-AI/Wan2.2-I2V-A14B-Diffusers ...
153164
```
@@ -166,7 +177,7 @@ Open the `.nsys-rep` file in the Nsight Systems GUI for detailed CUDA kernel tim
166177
Output files are saved to your configured ```VLLM_TORCH_PROFILER_DIR```.
167178

168179
**Output**
169-
**Chrome Trace** (```.json.gz```): Visual timeline of kernels and stages. Open in Perfetto UI.
180+
**Chrome Trace** (```.pt.trace.json```): Visual timeline of kernels and stages. Open in Perfetto UI.
170181

171182
**Viewing Tools:**
172183

vllm_omni/diffusion/diffusion_engine.py

Lines changed: 50 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -196,63 +196,49 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest):
196196

197197
def start_profile(self, trace_filename: str | None = None) -> None:
198198
"""
199-
Start torch profiling on all diffusion workers.
199+
Start profiling on all diffusion workers.
200200
201-
Creates a directory (if needed) and sets up a base filename template
202-
for per-rank profiler traces (typically saved as <template>_rank<N>.json).
203-
204-
Args:
205-
trace_filename: Optional base filename (without extension or rank suffix).
206-
If None, generates one using current timestamp.
201+
Profiling is configured via vLLM's profiler config/environment variables:
202+
- PyTorch profiler: VLLM_TORCH_PROFILER_DIR
203+
- Nsight Systems (cuda profiler): VLLM_TORCH_CUDA_PROFILE=1
207204
"""
208-
if trace_filename is None:
209-
trace_filename = f"stage_0_diffusion_{int(time.time())}_rank"
210-
211-
trace_dir = os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
212-
213-
# Expand ~ and ~user, then make absolute (robust against cwd changes)
214-
trace_dir = os.path.expanduser(trace_dir)
215-
trace_dir = os.path.abspath(trace_dir)
216-
217-
try:
218-
os.makedirs(trace_dir, exist_ok=True)
219-
except OSError as exc:
220-
logger.error(f"Failed to create profiler directory {trace_dir}: {exc}")
221-
raise
222-
223-
# Build final template path (without rank or extension — torch.profiler appends those)
224-
full_template = os.path.join(trace_dir, trace_filename)
225-
226-
expected_pattern = f"{full_template}*.json"
227-
logger.info(f"Starting diffusion profiling → {expected_pattern}")
205+
if trace_filename:
206+
logger.debug(
207+
"Diffusion profiling uses vLLM profiler config; trace_filename is ignored (%s).",
208+
trace_filename,
209+
)
228210

229-
# Also log the absolute directory once (useful in multi-node or containers)
230-
logger.debug(f"Profiler output directory: {trace_dir}")
211+
trace_dir = os.environ.get("VLLM_TORCH_PROFILER_DIR")
212+
if trace_dir:
213+
trace_dir = os.path.abspath(os.path.expanduser(trace_dir))
214+
try:
215+
os.makedirs(trace_dir, exist_ok=True)
216+
except OSError as exc:
217+
logger.error("Failed to create profiler directory %s: %s", trace_dir, exc)
218+
raise
219+
logger.info("Starting diffusion profiling. Torch traces will be written under %s", trace_dir)
220+
else:
221+
logger.info("Starting diffusion profiling.")
231222

232223
# Propagate to all workers
233224
try:
234-
self.collective_rpc(method="start_profile", args=(full_template,))
225+
self.collective_rpc(method="start_profile")
235226
except Exception as e:
236227
logger.error("Failed to start profiling on workers", exc_info=True)
237228
raise RuntimeError(f"Could not start profiler: {e}") from e
238229

239230
def stop_profile(self) -> dict:
240231
"""
241-
Stop profiling on all workers and collect the final trace/table paths.
242-
243-
The worker (torch_profiler.py) now handles trace export, compression to .gz,
244-
and deletion of the original .json file. This method only collects and
245-
reports the paths returned by the workers.
232+
Stop profiling on all workers and best-effort collect any legacy outputs.
246233
247-
Returns:
248-
dict with keys:
249-
- "traces": list of final trace file paths (usually .json.gz)
250-
- "tables": list of table strings (one per rank)
234+
vLLM's profiler wrappers write traces directly to disk and do not return
235+
per-rank file paths. This method preserves backward compatibility by
236+
aggregating any dict-like results if present.
251237
"""
252-
logger.info("Stopping diffusion profiling and collecting results...")
238+
logger.info("Stopping diffusion profiling...")
253239

254240
try:
255-
# Give worker enough time — export + compression + table can be slow
241+
# Give workers enough time — trace flushing can be slow
256242
results = self.collective_rpc(method="stop_profile", timeout=60000)
257243
except Exception:
258244
logger.error("Failed to stop profiling on workers", exc_info=True)
@@ -262,54 +248,46 @@ def stop_profile(self) -> dict:
262248
successful_traces = 0
263249

264250
if not results:
265-
logger.warning("No profiling results returned from any rank")
251+
logger.info("No profiling results returned from any rank.")
266252
return output_files
267253

268254
for rank, res in enumerate(results):
255+
if res is None:
256+
# vLLM profiler wrappers return no per-rank payloads.
257+
continue
269258
if not isinstance(res, dict):
270-
logger.warning(f"Rank {rank}: invalid result format (got {type(res)})")
259+
logger.warning("Rank %s: invalid result format (got %s)", rank, type(res))
271260
continue
272261

273-
# 1. Trace file — should be .json.gz if compression succeeded
274-
trace_path = res.get("trace")
262+
trace_path = res.get("trace") or res.get("traces")
275263
if trace_path:
276-
# We trust the worker — it created/compressed the file
277-
logger.info(f"[Rank {rank}] Final trace: {trace_path}")
278-
output_files["traces"].append(trace_path)
279-
successful_traces += 1
264+
if isinstance(trace_path, str):
265+
output_files["traces"].append(trace_path)
266+
elif isinstance(trace_path, list):
267+
output_files["traces"].extend(trace_path)
268+
successful_traces = len(output_files["traces"])
280269

281-
# Optional: warn if path looks suspicious (e.g. still .json)
282-
if not trace_path.endswith((".json.gz", ".json")):
283-
logger.warning(f"Rank {rank}: unusual trace path extension: {trace_path}")
284-
285-
# 2. Table file — plain text
286-
table = res.get("table")
270+
table = res.get("table") or res.get("tables")
287271
if table:
288-
output_files["tables"].append(table)
272+
if isinstance(table, str):
273+
output_files["tables"].append(table)
274+
elif isinstance(table, list):
275+
output_files["tables"].extend(table)
289276

290-
# Final summary logging
291-
num_ranks = len(results)
292277
if successful_traces > 0:
293-
final_paths_str = ", ".join(output_files["traces"][:3])
294-
if len(output_files["traces"]) > 3:
295-
final_paths_str += f" ... (+{len(output_files['traces']) - 3} more)"
296-
297278
logger.info(
298-
f"Profiling stopped. Collected {successful_traces} trace file(s) "
299-
f"from {num_ranks} rank(s). "
300-
f"Final trace paths: {final_paths_str}"
279+
"Profiling stopped. Collected %s trace file(s) from %s rank(s).",
280+
successful_traces,
281+
len(results),
301282
)
302-
elif output_files["traces"]:
283+
else:
303284
logger.info(
304-
f"Profiling stopped but no traces were successfully collected. "
305-
f"Reported paths: {', '.join(output_files['traces'][:3])}"
306-
f"{' ...' if len(output_files['traces']) > 3 else ''}"
285+
"Profiling stopped. Traces are written by the active profiler "
286+
"(PyTorch: VLLM_TORCH_PROFILER_DIR, nsys: -o output)."
307287
)
308-
else:
309-
logger.info("Profiling stopped — no trace files were collected from any rank.")
310288

311289
if output_files["tables"]:
312-
logger.debug(f"Collected {len(output_files['tables'])} profiling table(s)")
290+
logger.debug("Collected %s profiling table(s)", len(output_files["tables"]))
313291

314292
return output_files
315293

vllm_omni/diffusion/worker/diffusion_worker.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,6 @@ def init_device(self) -> None:
100100
profiler_config,
101101
worker_name=worker_name,
102102
local_rank=self.local_rank,
103-
activities=["CPU", "CUDA"],
104103
)
105104
elif profiler_config.profiler == "cuda":
106105
self.profiler = CudaProfilerWrapper(profiler_config)
@@ -179,7 +178,15 @@ def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfi
179178
if req.sampling_params.lora_request is not None:
180179
raise
181180
logger.warning("LoRA activation skipped: %s", exc)
182-
return self.model_runner.execute_model(req)
181+
profiler_context = (
182+
self.profiler.annotate_context_manager("diffusion_forward") if self.profiler is not None else nullcontext()
183+
)
184+
with profiler_context:
185+
output = self.model_runner.execute_model(req)
186+
if self.profiler is not None:
187+
# Drive delayed start/auto-stop behavior to match vLLM's profiler wrapper.
188+
self.profiler.step()
189+
return output
183190

184191
def load_weights(self, weights) -> set[str]:
185192
"""Load weights by delegating to the model runner."""

vllm_omni/entrypoints/omni.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -379,8 +379,10 @@ def _wait_for_stages_ready(self, timeout: int = 120) -> None:
379379
def start_profile(self, stages: list[int] | None = None) -> None:
380380
"""Start profiling for specified stages.
381381
382-
Sends start_profile command to stage workers. Profiling must be enabled
383-
via VLLM_TORCH_PROFILER_DIR environment variable.
382+
Sends start_profile command to stage workers. Profiling is configured
383+
via vLLM profiler environment variables, e.g.:
384+
- VLLM_TORCH_PROFILER_DIR for PyTorch profiler traces
385+
- VLLM_TORCH_CUDA_PROFILE=1 for Nsight Systems (cuda profiler)
384386
385387
Args:
386388
stages: List of stage IDs to start profiling. If None, starts
@@ -432,6 +434,9 @@ def stop_profile(self, stages: list[int] | None = None) -> dict:
432434
# This is the blocking call that triggers the RPC chain
433435
stage_data = stage.stop_profile()
434436

437+
if stage_data is None:
438+
continue
439+
435440
if isinstance(stage_data, dict):
436441
# FIX: Handle both single key and list key formats
437442
traces = stage_data.get("trace") or stage_data.get("traces")
@@ -457,8 +462,6 @@ def stop_profile(self, stages: list[int] | None = None) -> dict:
457462
all_results["tables"].append(tables)
458463
elif isinstance(tables, list):
459464
all_results["tables"].extend(tables)
460-
else:
461-
logger.warning(f"[{self._name}] Stage-{stage_id} returned no table data")
462465
else:
463466
logger.warning(f"[{self._name}] Stage-{stage_id} returned non-dict data: {type(stage_data)}")
464467
else:

vllm_omni/entrypoints/omni_diffusion.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ def start_profile(self, trace_filename: str | None = None) -> None:
120120
121121
Args:
122122
trace_filename: Optional base filename for trace files.
123-
If None, a timestamp-based name will be generated.
123+
Note: vLLM profiler wrappers ignore this value and write traces
124+
under VLLM_TORCH_PROFILER_DIR instead.
124125
"""
125126
if hasattr(self, "engine") and self.engine:
126127
self.engine.start_profile(trace_filename)

vllm_omni/entrypoints/omni_stage.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -735,11 +735,11 @@ def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
735735
if task_type == OmniStageTaskType.PROFILER_START:
736736
if stage_type == "diffusion":
737737
try:
738-
profile_dir = _os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
739-
_os.makedirs(profile_dir, exist_ok=True)
740-
trace_filename = f"stage_{stage_id}_diffusion_{int(_time.time())}"
741-
stage_engine.start_profile(trace_filename=trace_filename)
742-
logger.info("[Stage-%s] Diffusion Torch profiler started", stage_id)
738+
profile_dir = _os.environ.get("VLLM_TORCH_PROFILER_DIR")
739+
if profile_dir:
740+
_os.makedirs(profile_dir, exist_ok=True)
741+
stage_engine.start_profile()
742+
logger.info("[Stage-%s] Diffusion profiler started", stage_id)
743743
except Exception as e:
744744
logger.warning("[Stage-%s] Failed to start diffusion profiler: %s", stage_id, e)
745745
else:
@@ -753,10 +753,9 @@ def handle_profiler_task_local(task_type: OmniStageTaskType) -> dict:
753753
elif task_type == OmniStageTaskType.PROFILER_STOP:
754754
if stage_type == "diffusion":
755755
try:
756-
# CRITICAL: Capture return value
757756
result_data = stage_engine.stop_profile()
758-
logger.info("[Stage-%s] Diffusion Torch profiler stopped", stage_id)
759-
return result_data
757+
logger.info("[Stage-%s] Diffusion profiler stopped", stage_id)
758+
return result_data if isinstance(result_data, dict) else {}
760759
except Exception as e:
761760
logger.warning("[Stage-%s] Failed to stop diffusion profiler: %s", stage_id, e)
762761
return {}
@@ -1289,11 +1288,11 @@ async def handle_profiler_task_async(task_type: OmniStageTaskType) -> None:
12891288
if stage_type == "diffusion":
12901289
try:
12911290
# Sync call is safe here — diffusion profiling is lightweight
1292-
profile_dir = os.environ.get("VLLM_TORCH_PROFILER_DIR", "./profiles")
1293-
os.makedirs(profile_dir, exist_ok=True)
1294-
trace_filename = f"stage_{stage_id}_diffusion_{int(time.time())}"
1295-
stage_engine.start_profile(trace_filename=trace_filename)
1296-
logger.info("[Stage-%s] Diffusion Torch profiler started", stage_id)
1291+
profile_dir = os.environ.get("VLLM_TORCH_PROFILER_DIR")
1292+
if profile_dir:
1293+
os.makedirs(profile_dir, exist_ok=True)
1294+
stage_engine.start_profile()
1295+
logger.info("[Stage-%s] Diffusion profiler started", stage_id)
12971296
except Exception as e:
12981297
logger.warning("[Stage-%s] Failed to start diffusion profiler: %s", stage_id, e)
12991298
else:
@@ -1306,10 +1305,10 @@ async def handle_profiler_task_async(task_type: OmniStageTaskType) -> None:
13061305
elif task_type == OmniStageTaskType.PROFILER_STOP:
13071306
if stage_type == "diffusion":
13081307
try:
1309-
trace_files = stage_engine.stop_profile()
1310-
logger.info("[Stage-%s] Diffusion Torch profiler stopped", stage_id)
1311-
if trace_files:
1312-
logger.info("Diffusion trace files: %s", trace_files)
1308+
result_data = stage_engine.stop_profile()
1309+
logger.info("[Stage-%s] Diffusion profiler stopped", stage_id)
1310+
if result_data:
1311+
logger.info("Diffusion profiler result: %s", result_data)
13131312
except Exception as e:
13141313
logger.warning("[Stage-%s] Failed to stop diffusion profiler: %s", stage_id, e)
13151314
else:

0 commit comments

Comments
 (0)