Skip to content

Commit f68fe92

Browse files
authored
Merge pull request #358 from AInVFX/main
v2.5.15: MPS compatibility fixes, autocast device type, VRAM tracking, Triton 3.0+ compatibility
2 parents d4dd5e7 + 65c1c1b commit f68fe92

File tree

14 files changed

+56
-63
lines changed

14 files changed

+56
-63
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,13 @@ We're actively working on improvements and new features. To stay informed:
3636

3737
## 🚀 Updates
3838

39+
**2025.12.03 - Version 2.5.15**
40+
41+
- **🍎 Fix: MPS compatibility** - Disable antialias for MPS tensors and fix bfloat16 arange issues
42+
- **⚡ Fix: Autocast device type** - Use proper device type attribute to prevent autocast errors
43+
- **📊 Memory: Accurate VRAM tracking** - Use max_memory_reserved for more precise peak reporting
44+
- **🔧 Fix: Triton compatibility** - Add shim for bitsandbytes 0.45+ / triton 3.0+ (fixes PyTorch 2.7 installation errors)
45+
3946
**2025.12.01 - Version 2.5.14**
4047

4148
- **🍎 Fix: MPS device comparison** - Normalize device strings to prevent unnecessary tensor movements

__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
Official SeedVR2 integration for ComfyUI
44
"""
55

6+
from .src.optimization.compatibility import ensure_triton_compat # noqa: F401
67
from .src.interfaces import comfy_entrypoint, SeedVR2Extension
78

89
__all__ = ["comfy_entrypoint", "SeedVR2Extension"]

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "seedvr2_videoupscaler"
33
description = "SeedVR2 official ComfyUI integration: ByteDance-Seed's one-step diffusion-based video/image upscaling with memory-efficient inference"
4-
version = "2.5.14"
4+
version = "2.5.15"
55
authors = [
66
{name = "numz"},
77
{name = "adrientoupet"}

src/common/diffusion/timesteps/sampling/trailing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def __init__(
3636
dtype: torch.dtype = torch.float32,
3737
):
3838
# Create trailing timesteps with specified dtype
39-
timesteps = torch.arange(1.0, 0.0, -1.0 / steps, device=device, dtype=dtype)
39+
timesteps = torch.arange(1.0, 0.0, -1.0 / steps, device='cpu').to(device=device, dtype=dtype)
4040

4141
# Shift timesteps.
4242
timesteps = shift * timesteps / (1 + (shift - 1) * timesteps)

src/core/generation_phases.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@ def _add_noise(x, aug_noise):
711711
debug.start_timer(f"dit_inference_{upscale_idx+1}")
712712
with torch.no_grad():
713713
if dit_dtype != ctx['compute_dtype']:
714-
with torch.autocast(str(ctx['dit_device']), ctx['compute_dtype'], enabled=True):
714+
with torch.autocast(ctx['dit_device'].type, ctx['compute_dtype'], enabled=True):
715715
upscaled_latents = runner.inference(
716716
noises=noises,
717717
conditions=conditions,

src/core/infer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def vae_encode(self, samples: List[Tensor]) -> List[Tensor]:
155155

156156
# Use autocast if VAE dtype differs from input dtype
157157
if vae_dtype != sample.dtype:
158-
with torch.autocast(str(device), sample.dtype, enabled=True):
158+
with torch.autocast(device.type, sample.dtype, enabled=True):
159159
if use_sample:
160160
latent = self.vae.encode(sample, tiled=self.encode_tiled, tile_size=self.encode_tile_size,
161161
tile_overlap=self.encode_tile_overlap).latent
@@ -231,7 +231,7 @@ def vae_decode(self, latents: List[Tensor]) -> List[Tensor]:
231231

232232
# Use autocast if VAE dtype differs from latent dtype
233233
if vae_dtype != latent.dtype:
234-
with torch.autocast(str(device), latent.dtype, enabled=True):
234+
with torch.autocast(device.type, latent.dtype, enabled=True):
235235
sample = self.vae.decode(
236236
latent,
237237
tiled=self.decode_tiled, tile_size=self.decode_tile_size,

src/data/image/transforms/area_resize.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,12 @@ def __call__(self, image: Union[torch.Tensor, Image.Image]):
5050

5151
resized_height, resized_width = round(height * scale), round(width * scale)
5252

53+
antialias = not (isinstance(image, torch.Tensor) and image.device.type == 'mps')
5354
return TVF.resize(
5455
image,
5556
size=(resized_height, resized_width),
5657
interpolation=self.interpolation,
58+
antialias=antialias,
5759
)
5860

5961

src/data/image/transforms/side_resize.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,9 @@ def __call__(self, image: Union[torch.Tensor, Image.Image]):
5656
else:
5757
size = self.size
5858

59-
# Resize to shortest edge
60-
resized = TVF.resize(image, size, self.interpolation)
59+
# Resize to shortest edge (disable antialias only for MPS tensors - not supported)
60+
antialias = not (isinstance(image, torch.Tensor) and image.device.type == 'mps')
61+
resized = TVF.resize(image, size, self.interpolation, antialias=antialias)
6162

6263
# Apply max_size constraint if specified
6364
if self.max_size > 0:
@@ -69,6 +70,6 @@ def __call__(self, image: Union[torch.Tensor, Image.Image]):
6970
if max(h, w) > self.max_size:
7071
scale = self.max_size / max(h, w)
7172
new_h, new_w = round(h * scale), round(w * scale)
72-
resized = TVF.resize(resized, (new_h, new_w), self.interpolation)
73+
resized = TVF.resize(resized, (new_h, new_w), self.interpolation, antialias=antialias)
7374

7475
return resized

src/models/video_vae_v3/modules/attn_video_vae.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import torch.nn as nn
1818
import torch.nn.functional as F
1919
from diffusers.models.attention_processor import Attention, SpatialNorm
20+
from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
2021
from diffusers.models.downsampling import Downsample2D
2122
from diffusers.models.lora import LoRACompatibleConv
2223
from diffusers.models.modeling_outputs import AutoencoderKLOutput
@@ -45,8 +46,6 @@
4546
CausalAutoencoderOutput,
4647
CausalDecoderOutput,
4748
CausalEncoderOutput,
48-
DecoderOutput,
49-
DiagonalGaussianDistribution,
5049
MemoryState,
5150
_inflation_mode_t,
5251
_memory_device_t,

src/models/video_vae_v3/modules/types.py

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -74,51 +74,3 @@ class CausalEncoderOutput(NamedTuple):
7474

7575
class CausalDecoderOutput(NamedTuple):
7676
sample: torch.Tensor
77-
78-
79-
class DecoderOutput:
80-
"""Output of decoding method - matches diffusers.models.autoencoders.vae.DecoderOutput"""
81-
def __init__(self, sample: torch.Tensor, commit_loss: Optional[torch.Tensor] = None):
82-
self.sample = sample
83-
self.commit_loss = commit_loss
84-
85-
86-
class DiagonalGaussianDistribution:
87-
"""Matches diffusers.models.autoencoders.vae.DiagonalGaussianDistribution exactly."""
88-
def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
89-
self.parameters = parameters
90-
self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
91-
self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
92-
self.deterministic = deterministic
93-
self.std = torch.exp(0.5 * self.logvar)
94-
self.var = torch.exp(self.logvar)
95-
if self.deterministic:
96-
self.var = self.std = torch.zeros_like(
97-
self.mean, device=self.parameters.device, dtype=self.parameters.dtype
98-
)
99-
100-
def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor:
101-
if self.deterministic:
102-
return self.mode()
103-
sample = torch.randn(
104-
self.mean.shape,
105-
generator=generator,
106-
device=self.parameters.device,
107-
dtype=self.parameters.dtype,
108-
)
109-
return self.mean + self.std * sample
110-
111-
def mode(self) -> torch.Tensor:
112-
return self.mean
113-
114-
def kl(self, other: Optional["DiagonalGaussianDistribution"] = None) -> torch.Tensor:
115-
if other is None:
116-
return 0.5 * torch.sum(
117-
self.mean.pow(2) + self.var - 1.0 - self.logvar,
118-
dim=[1, 2, 3],
119-
)
120-
return 0.5 * torch.sum(
121-
(self.mean - other.mean).pow(2) / other.var
122-
+ self.var / other.var - 1.0 - self.logvar + other.logvar,
123-
dim=[1, 2, 3],
124-
)

0 commit comments

Comments
 (0)