Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions vllm_omni/diffusion/worker/diffusion_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
"""

import gc
import json
import multiprocessing as mp
import os
from collections.abc import Iterable
from contextlib import AbstractContextManager, nullcontext
from pathlib import Path
from typing import Any

import torch
Expand Down Expand Up @@ -58,6 +60,43 @@ class DiffusionWorker:
delegated to DiffusionModelRunner.
"""

@staticmethod
def predict_resource_usage(od_config: OmniDiffusionConfig) -> dict[str, float]:
import torch
from vllm.utils.mem_utils import GiB_bytes

total_params = 0
try:
model_path = Path(od_config.model)
for cfg_name in ["config.json", "llm_config.json", "diffusion_config.json"]:
cfg_file = model_path / cfg_name
if cfg_file.exists():
with open(cfg_file) as f:
data = json.load(f)
total_params = data.get("num_parameters", 0) or data.get("total_params", 0)
if total_params > 0:
break
except Exception:
pass
if total_params == 0:
m_name = str(od_config.model).lower()
if "bagel" in m_name:
total_params = 13.5e9
elif "flux" in m_name:
total_params = 12.0e9
else:
total_params = 10.0e9
dtype = getattr(od_config, "dtype", torch.bfloat16)
bytes_per_param = 2 if dtype in [torch.bfloat16, torch.float16] else 4
static_gb = (total_params * bytes_per_param) / GiB_bytes
h, w = getattr(od_config, "height", 1024), getattr(od_config, "width", 1024)
dynamic_gb = 2.5 * (h * w / (1024 * 1024))
return {
"static_gb": round(static_gb, 2),
"dynamic_gb": round(dynamic_gb, 2),
"total_gb": round(static_gb + dynamic_gb, 2),
}

def __init__(
self,
local_rank: int,
Expand Down
53 changes: 53 additions & 0 deletions vllm_omni/entrypoints/omni.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,57 @@ def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[st

return config_path, stage_configs

def _coordinate_vram_resources(self) -> None:
    """Coordinate VRAM budgets across stages before they start.

    Pass 1: for every diffusion stage, predict its memory footprint and
    tally the reserved GiB per device id. Pass 2 (CUDA only): for every
    LLM stage whose master device carries a diffusion reservation, raise
    its ``gpu_memory_utilization`` by the reserved fraction of that card,
    capped at 0.95.

    NOTE(review): the adjustment *increases* the LLM utilization when other
    stages reserve memory on the same card -- confirm this matches how the
    downstream engine interprets ``gpu_memory_utilization``.
    """
    import torch
    from vllm.utils.mem_utils import GiB_bytes

    from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker

    # GiB reserved by diffusion stages, keyed by device id.
    reserved_gb_per_device: dict[int, float] = {}

    # Honor single-stage mode: coordinate only the selected stage.
    active_configs = self.stage_configs
    if getattr(self, "_single_stage_id", None) is not None:
        active_configs = [c for c in self.stage_configs if c.stage_id == self._single_stage_id]

    # Pass 1: tally predicted diffusion budgets per device.
    for stage in active_configs:
        s_type = getattr(stage, "stage_type", None)
        if s_type != "diffusion":
            continue
        prediction = DiffusionWorker.predict_resource_usage(stage.engine_args)
        raw_devices = getattr(stage.runtime, "devices", "0")
        try:
            devices = [int(token.strip()) for token in raw_devices.split(",")]
        except (ValueError, AttributeError):
            # Non-string / malformed device spec: assume device 0.
            devices = [0]
        for device_id in devices:
            previous = reserved_gb_per_device.get(device_id, 0.0)
            reserved_gb_per_device[device_id] = previous + prediction["total_gb"]
        logger.info(
            f"[Coordinator] Stage-{stage.stage_id} ({s_type.capitalize()}) "
            f"on devices {devices} predicted budget: {prediction['total_gb']:.2f} GiB"
        )

    if not torch.cuda.is_available():
        return

    # Pass 2: bump each LLM stage's utilization by the share reserved on its card.
    for stage in active_configs:
        if getattr(stage, "stage_type", None) != "llm":
            continue
        # Master device is the first entry of the stage's device list.
        raw_devices = getattr(stage.runtime, "devices", "0")
        try:
            target_device = int(raw_devices.split(",")[0].strip())
        except (ValueError, AttributeError):
            target_device = 0
        # Physical capacity of the card hosting this LLM stage.
        physical_vram_gb = torch.cuda.get_device_properties(target_device).total_memory / GiB_bytes
        reserved_here = reserved_gb_per_device.get(target_device, 0.0)
        if reserved_here <= 0:
            continue
        original_util = stage.engine_args.get("gpu_memory_utilization", 0.9)
        reserved_util_ratio = reserved_here / physical_vram_gb
        stage.engine_args["gpu_memory_utilization"] = round(
            min(0.95, original_util + reserved_util_ratio), 3
        )
        logger.info(
            f"[Coordinator] LLM Stage-{stage.stage_id} on Device {target_device} dynamic boost: "
            f"{original_util} -> {stage.engine_args['gpu_memory_utilization']} "
            f"(Compensating {reserved_util_ratio:.2f} ratio for resource domain isolation)"
        )

def _initialize_stages(self, model: str, kwargs: dict[str, Any]) -> None:
"""Initialize stage list management."""
stage_init_timeout = kwargs.get("stage_init_timeout", 20)
Expand All @@ -316,6 +367,8 @@ def _initialize_stages(self, model: str, kwargs: dict[str, Any]) -> None:
# Resolve stage configs shared by orchestrator/headless paths.
self.config_path, self.stage_configs = self._resolve_stage_configs(model, kwargs)

self._coordinate_vram_resources()

# Initialize connectors
self.omni_transfer_config, self.connectors = initialize_orchestrator_connectors(
self.config_path, worker_backend=worker_backend, shm_threshold_bytes=shm_threshold_bytes
Expand Down
4 changes: 4 additions & 0 deletions vllm_omni/platforms/cuda/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ def get_free_memory(cls, device: torch.device | None = None) -> int:
free, _ = torch.cuda.mem_get_info(device)
return free

@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
    """Return the total physical memory of CUDA device ``device_id``, in bytes."""
    device_props = torch.cuda.get_device_properties(device_id)
    return device_props.total_memory

@classmethod
def get_device_name(cls, device_id: int = 0) -> str:
return torch.cuda.get_device_name(device_id)
4 changes: 4 additions & 0 deletions vllm_omni/platforms/rocm/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,7 @@ def synchronize(cls) -> None:
def get_free_memory(cls, device: torch.device | None = None) -> int:
free, _ = torch.cuda.mem_get_info(device)
return free

@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
    """Return the total physical memory of device ``device_id``, in bytes.

    NOTE(review): queries via ``torch.cuda`` -- on ROCm builds of PyTorch
    this namespace is backed by HIP, so the call should work unchanged;
    confirm on an actual ROCm host.
    """
    return torch.cuda.get_device_properties(device_id).total_memory