Skip to content

Commit 2006fa3

Browse files
authored
Merge pull request #390 from AInVFX/main
v2.5.19: new logo, remove dead flash-attn wrapper, graceful DLL fallback, improved VRAM tracking
2 parents a06afb5 + 118c9fc commit 2006fa3

File tree

11 files changed

+249
-360
lines changed

11 files changed

+249
-360
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,15 @@ We're actively working on improvements and new features. To stay informed:
3636

3737
## 🚀 Updates
3838

39+
**2025.12.10 - Version 2.5.19**
40+
41+
- **🎨 New header logo design** - Refreshed ASCII art banner *(thanks [@naxci1](https://github.com/naxci1))*
42+
- **🧹 Remove dead flash attention wrapper** - Removed legacy code from FP8CompatibleDiT; FlashAttentionVarlen already handles backend switching via its `attention_mode` attribute
43+
- **🛡️ Fix graceful fallback from flash-attn** - Add compatibility shims for corrupted flash_attn/xformers DLLs, preventing startup crashes when CUDA extensions are broken
44+
- **📊 Improved VRAM tracking** - Separate allocated vs reserved memory tracking, Windows-only overflow detection (WDDM paging behavior)
45+
- **♻️ Centralize backend detection** - Unified `is_mps_available()`, `is_cuda_available()`, `get_gpu_backend()` helpers across codebase
46+
- **🔄 Revert 2.5.14 VRAM limit enforcement** - Removed `set_per_process_memory_fraction` call; overflow detection and warnings remain.
47+
3948
**2025.12.09 - Version 2.5.18**
4049

4150
- **🚀 CLI: Streaming mode for long videos** - New `--chunk_size` flag processes videos in memory-bounded chunks, enabling arbitrarily long videos without RAM limits. Works with model caching (`--cache_dit`/`--cache_vae`) for chunk-to-chunk reuse *(inspired by [disk02](https://github.com/disk02) PR contribution)*
@@ -872,6 +881,7 @@ python inference_cli.py media_folder/ \
872881
- `--tile_debug`: Visualize tiles: 'false' (default), 'encode', or 'decode'
873882

874883
**Performance Optimization:**
884+
- `--allow_vram_overflow`: Allow VRAM overflow to system RAM. Prevents OOM but may cause severe slowdown
875885
- `--attention_mode`: Attention backend: 'sdpa' (default, stable) or 'flash_attn' (faster, requires package)
876886
- `--compile_dit`: Enable torch.compile for DiT model (20-40% speedup, requires PyTorch 2.0+ and Triton)
877887
- `--compile_vae`: Enable torch.compile for VAE model (15-25% speedup, requires PyTorch 2.0+ and Triton)

inference_cli.py

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
else:
7777
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "backend:cudaMallocAsync")
7878

79-
# Pre-parse CUDA device argument for validation and environment setup
79+
# Pre-parse arguments that must be handled before torch import
8080
_pre_parser = argparse.ArgumentParser(add_help=False)
8181
_pre_parser.add_argument("--cuda_device", type=str, default=None)
8282
_pre_args, _ = _pre_parser.parse_known_args()
@@ -127,24 +127,13 @@
127127
postprocess_all_batches
128128
)
129129
from src.utils.debug import Debug
130-
from src.optimization.memory_manager import clear_memory
130+
from src.optimization.memory_manager import clear_memory, get_gpu_backend, is_cuda_available
131131
debug = Debug(enabled=False) # Will be enabled via --debug CLI flag
132132

133-
134133
# =============================================================================
135134
# Device Management Helpers
136135
# =============================================================================
137136

138-
def _get_platform_type() -> str:
139-
"""Determine the platform device type (cuda/mps/cpu)."""
140-
if platform.system() == "Darwin":
141-
return "mps"
142-
elif torch.cuda.is_available():
143-
return "cuda"
144-
else:
145-
return "cpu"
146-
147-
148137
def _device_id_to_name(device_id: str, platform_type: str = None) -> str:
149138
"""
150139
Convert device ID to full device name.
@@ -160,7 +149,7 @@ def _device_id_to_name(device_id: str, platform_type: str = None) -> str:
160149
return device_id
161150

162151
if platform_type is None:
163-
platform_type = _get_platform_type()
152+
platform_type = get_gpu_backend()
164153

165154
# MPS typically doesn't use indices
166155
if platform_type == "mps":
@@ -777,7 +766,7 @@ def _process_frames_core(
777766
Upscaled frames tensor [T', H', W', C], Float32, range [0,1]
778767
"""
779768
# Determine platform and convert device IDs to full names
780-
platform_type = _get_platform_type()
769+
platform_type = get_gpu_backend()
781770
inference_device = _device_id_to_name(device_id, platform_type)
782771

783772
# Parse offload devices (with caching defaults)
@@ -1466,15 +1455,15 @@ def main() -> None:
14661455

14671456
# Inform about caching defaults
14681457
if args.cache_dit and args.dit_offload_device == "none":
1469-
offload_target = "system memory (CPU)" if _get_platform_type() != "mps" else "unified memory"
1458+
offload_target = "system memory (CPU)" if get_gpu_backend() != "mps" else "unified memory"
14701459
debug.log(
14711460
f"DiT caching enabled: Using default {offload_target} for offload. "
14721461
"Set --dit_offload_device explicitly to use a different device.",
14731462
category="cache", force=True
14741463
)
14751464

14761465
if args.cache_vae and args.vae_offload_device == "none":
1477-
offload_target = "system memory (CPU)" if _get_platform_type() != "mps" else "unified memory"
1466+
offload_target = "system memory (CPU)" if get_gpu_backend() != "mps" else "unified memory"
14781467
debug.log(
14791468
f"VAE caching enabled: Using default {offload_target} for offload. "
14801469
"Set --vae_offload_device explicitly to use a different device.",
@@ -1487,7 +1476,7 @@ def main() -> None:
14871476
else:
14881477
# Show actual CUDA device visibility
14891478
debug.log(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set (all)')}", category="device")
1490-
if torch.cuda.is_available():
1479+
if is_cuda_available():
14911480
debug.log(f"torch.cuda.device_count(): {torch.cuda.device_count()}", category="device")
14921481
debug.log(f"Using device index 0 inside script (mapped to selected GPU)", category="device")
14931482

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "seedvr2_videoupscaler"
33
description = "SeedVR2 official ComfyUI integration: ByteDance-Seed's one-step diffusion-based video/image upscaling with memory-efficient inference"
4-
version = "2.5.18"
4+
version = "2.5.19"
55
authors = [
66
{name = "numz"},
77
{name = "adrientoupet"}

src/common/distributed/basic.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import torch
2222
import torch.distributed as dist
2323
from torch.nn.parallel import DistributedDataParallel
24+
from ...optimization.memory_manager import is_mps_available
2425

2526
def get_global_rank() -> int:
2627
"""
@@ -47,7 +48,7 @@ def get_device() -> torch.device:
4748
"""
4849
Get current rank device.
4950
"""
50-
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
51+
if is_mps_available():
5152
return torch.device("mps")
5253
return torch.device("cuda", get_local_rank())
5354

src/data/image/transforms/area_resize.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from PIL import Image
2020
from torchvision.transforms import functional as TVF
2121
from torchvision.transforms.functional import InterpolationMode
22+
from ....optimization.memory_manager import is_mps_available
2223

2324

2425
class AreaResize:
@@ -31,7 +32,7 @@ def __init__(
3132
self.max_area = max_area
3233
self.downsample_only = downsample_only
3334
self.interpolation = interpolation
34-
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
35+
if is_mps_available():
3536
self.interpolation = InterpolationMode.BILINEAR
3637

3738
def __call__(self, image: Union[torch.Tensor, Image.Image]):

src/data/image/transforms/na_resize.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from .area_resize import AreaResize
2020
from .side_resize import SideResize
21+
from ....optimization.memory_manager import is_mps_available
2122

2223
def NaResize(
2324
resolution: int,
@@ -26,7 +27,7 @@ def NaResize(
2627
max_resolution: int = 0,
2728
interpolation: InterpolationMode = InterpolationMode.BICUBIC,
2829
):
29-
Interpolation = InterpolationMode.BILINEAR if (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()) else interpolation
30+
Interpolation = InterpolationMode.BILINEAR if is_mps_available() else interpolation
3031
if mode == "area":
3132
return AreaResize(
3233
max_area=resolution**2,

src/data/image/transforms/side_resize.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from PIL import Image
1818
from torchvision.transforms import InterpolationMode
1919
from torchvision.transforms import functional as TVF
20+
from ....optimization.memory_manager import is_mps_available
2021

2122
class SideResize:
2223
def __init__(
@@ -30,7 +31,7 @@ def __init__(
3031
self.max_size = max_size
3132
self.downsample_only = downsample_only
3233
self.interpolation = interpolation
33-
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
34+
if is_mps_available():
3435
self.interpolation = InterpolationMode.BILINEAR
3536

3637
def __call__(self, image: Union[torch.Tensor, Image.Image]):

0 commit comments

Comments
 (0)