Skip to content

Commit 2006fa3

Browse files
authored
Merge pull request #390 from AInVFX/main
v2.5.19: new logo, remove dead flash-attn wrapper, graceful DLL fallback, improved VRAM tracking
2 parents a06afb5 + 118c9fc commit 2006fa3

File tree

11 files changed

+249
-360
lines changed

11 files changed

+249
-360
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,15 @@ We're actively working on improvements and new features. To stay informed:
3636

3737
## 🚀 Updates
3838

39+
**2025.12.10 - Version 2.5.19**
40+
41+
- **🎨 New header logo design** - Refreshed ASCII art banner *(thanks [@naxci1](https://github.com/naxci1))*
42+
- **🧹 Remove dead flash attention wrapper** - Removed legacy code from FP8CompatibleDiT; FlashAttentionVarlen already handles backend switching via its `attention_mode` attribute
43+
- **🛡️ Fix graceful fallback from flash-attn** - Add compatibility shims for corrupted flash_attn/xformers DLLs, preventing startup crashes when CUDA extensions are broken
44+
- **📊 Improved VRAM tracking** - Separate allocated vs reserved memory tracking, Windows-only overflow detection (WDDM paging behavior)
45+
- **♻️ Centralize backend detection** - Unified `is_mps_available()`, `is_cuda_available()`, `get_gpu_backend()` helpers across codebase
46+
- **🔄 Revert 2.5.14 VRAM limit enforcement** - Removed `set_per_process_memory_fraction` call; overflow detection and warnings remain.
47+
3948
**2025.12.09 - Version 2.5.18**
4049

4150
- **🚀 CLI: Streaming mode for long videos** - New `--chunk_size` flag processes videos in memory-bounded chunks, enabling arbitrarily long videos without RAM limits. Works with model caching (`--cache_dit`/`--cache_vae`) for chunk-to-chunk reuse *(inspired by [disk02](https://github.com/disk02) PR contribution)*
@@ -872,6 +881,7 @@ python inference_cli.py media_folder/ \
872881
- `--tile_debug`: Visualize tiles: 'false' (default), 'encode', or 'decode'
873882

874883
**Performance Optimization:**
884+
- `--allow_vram_overflow`: Allow VRAM overflow to system RAM. Prevents OOM but may cause severe slowdown
875885
- `--attention_mode`: Attention backend: 'sdpa' (default, stable) or 'flash_attn' (faster, requires package)
876886
- `--compile_dit`: Enable torch.compile for DiT model (20-40% speedup, requires PyTorch 2.0+ and Triton)
877887
- `--compile_vae`: Enable torch.compile for VAE model (15-25% speedup, requires PyTorch 2.0+ and Triton)

inference_cli.py

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
else:
7777
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "backend:cudaMallocAsync")
7878

79-
# Pre-parse CUDA device argument for validation and environment setup
79+
# Pre-parse arguments that must be handled before torch import
8080
_pre_parser = argparse.ArgumentParser(add_help=False)
8181
_pre_parser.add_argument("--cuda_device", type=str, default=None)
8282
_pre_args, _ = _pre_parser.parse_known_args()
@@ -127,24 +127,13 @@
127127
postprocess_all_batches
128128
)
129129
from src.utils.debug import Debug
130-
from src.optimization.memory_manager import clear_memory
130+
from src.optimization.memory_manager import clear_memory, get_gpu_backend, is_cuda_available
131131
debug = Debug(enabled=False) # Will be enabled via --debug CLI flag
132132

133-
134133
# =============================================================================
135134
# Device Management Helpers
136135
# =============================================================================
137136

138-
def _get_platform_type() -> str:
139-
"""Determine the platform device type (cuda/mps/cpu)."""
140-
if platform.system() == "Darwin":
141-
return "mps"
142-
elif torch.cuda.is_available():
143-
return "cuda"
144-
else:
145-
return "cpu"
146-
147-
148137
def _device_id_to_name(device_id: str, platform_type: str = None) -> str:
149138
"""
150139
Convert device ID to full device name.
@@ -160,7 +149,7 @@ def _device_id_to_name(device_id: str, platform_type: str = None) -> str:
160149
return device_id
161150

162151
if platform_type is None:
163-
platform_type = _get_platform_type()
152+
platform_type = get_gpu_backend()
164153

165154
# MPS typically doesn't use indices
166155
if platform_type == "mps":
@@ -777,7 +766,7 @@ def _process_frames_core(
777766
Upscaled frames tensor [T', H', W', C], Float32, range [0,1]
778767
"""
779768
# Determine platform and convert device IDs to full names
780-
platform_type = _get_platform_type()
769+
platform_type = get_gpu_backend()
781770
inference_device = _device_id_to_name(device_id, platform_type)
782771

783772
# Parse offload devices (with caching defaults)
@@ -1466,15 +1455,15 @@ def main() -> None:
14661455

14671456
# Inform about caching defaults
14681457
if args.cache_dit and args.dit_offload_device == "none":
1469-
offload_target = "system memory (CPU)" if _get_platform_type() != "mps" else "unified memory"
1458+
offload_target = "system memory (CPU)" if get_gpu_backend() != "mps" else "unified memory"
14701459
debug.log(
14711460
f"DiT caching enabled: Using default {offload_target} for offload. "
14721461
"Set --dit_offload_device explicitly to use a different device.",
14731462
category="cache", force=True
14741463
)
14751464

14761465
if args.cache_vae and args.vae_offload_device == "none":
1477-
offload_target = "system memory (CPU)" if _get_platform_type() != "mps" else "unified memory"
1466+
offload_target = "system memory (CPU)" if get_gpu_backend() != "mps" else "unified memory"
14781467
debug.log(
14791468
f"VAE caching enabled: Using default {offload_target} for offload. "
14801469
"Set --vae_offload_device explicitly to use a different device.",
@@ -1487,7 +1476,7 @@ def main() -> None:
14871476
else:
14881477
# Show actual CUDA device visibility
14891478
debug.log(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set (all)')}", category="device")
1490-
if torch.cuda.is_available():
1479+
if is_cuda_available():
14911480
debug.log(f"torch.cuda.device_count(): {torch.cuda.device_count()}", category="device")
14921481
debug.log(f"Using device index 0 inside script (mapped to selected GPU)", category="device")
14931482

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "seedvr2_videoupscaler"
33
description = "SeedVR2 official ComfyUI integration: ByteDance-Seed's one-step diffusion-based video/image upscaling with memory-efficient inference"
4-
version = "2.5.18"
4+
version = "2.5.19"
55
authors = [
66
{name = "numz"},
77
{name = "adrientoupet"}

src/common/distributed/basic.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import torch
2222
import torch.distributed as dist
2323
from torch.nn.parallel import DistributedDataParallel
24+
from ...optimization.memory_manager import is_mps_available
2425

2526
def get_global_rank() -> int:
2627
"""
@@ -47,7 +48,7 @@ def get_device() -> torch.device:
4748
"""
4849
Get current rank device.
4950
"""
50-
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
51+
if is_mps_available():
5152
return torch.device("mps")
5253
return torch.device("cuda", get_local_rank())
5354

src/data/image/transforms/area_resize.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from PIL import Image
2020
from torchvision.transforms import functional as TVF
2121
from torchvision.transforms.functional import InterpolationMode
22+
from ....optimization.memory_manager import is_mps_available
2223

2324

2425
class AreaResize:
@@ -31,7 +32,7 @@ def __init__(
3132
self.max_area = max_area
3233
self.downsample_only = downsample_only
3334
self.interpolation = interpolation
34-
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
35+
if is_mps_available():
3536
self.interpolation = InterpolationMode.BILINEAR
3637

3738
def __call__(self, image: Union[torch.Tensor, Image.Image]):

src/data/image/transforms/na_resize.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from .area_resize import AreaResize
2020
from .side_resize import SideResize
21+
from ....optimization.memory_manager import is_mps_available
2122

2223
def NaResize(
2324
resolution: int,
@@ -26,7 +27,7 @@ def NaResize(
2627
max_resolution: int = 0,
2728
interpolation: InterpolationMode = InterpolationMode.BICUBIC,
2829
):
29-
Interpolation = InterpolationMode.BILINEAR if (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()) else interpolation
30+
Interpolation = InterpolationMode.BILINEAR if is_mps_available() else interpolation
3031
if mode == "area":
3132
return AreaResize(
3233
max_area=resolution**2,

src/data/image/transforms/side_resize.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from PIL import Image
1818
from torchvision.transforms import InterpolationMode
1919
from torchvision.transforms import functional as TVF
20+
from ....optimization.memory_manager import is_mps_available
2021

2122
class SideResize:
2223
def __init__(
@@ -30,7 +31,7 @@ def __init__(
3031
self.max_size = max_size
3132
self.downsample_only = downsample_only
3233
self.interpolation = interpolation
33-
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
34+
if is_mps_available():
3435
self.interpolation = InterpolationMode.BILINEAR
3536

3637
def __call__(self, image: Union[torch.Tensor, Image.Image]):

0 commit comments

Comments
 (0)