Merge pull request #344 from AInVFX/main

adrientoupet · web-flow · commit d4dd5e747d51 · 2025-12-01T00:33:53.000-05:00
v2.5.14: MPS device fix, VRAM swap detection, enforce physical VRAM limit
diff --git a/README.md b/README.md
@@ -36,6 +36,12 @@ We're actively working on improvements and new features. To stay informed:
 
 ## 🚀 Updates
 
+**2025.12.01 - Version 2.5.14**
+
+- **🍎 Fix: MPS device comparison** - Normalize device strings to prevent unnecessary tensor movements
+- **📊 Memory: VRAM swap detection** - Peak stats now show a GPU + swap breakdown when overflow occurs, and a warning is logged when swap is detected
+- **🛡️ Memory: Enforce physical VRAM limit** - PyTorch now OOMs instead of silently swapping to shared memory (prevents extreme slowdowns on Windows)
+
 **2025.11.30 - Version 2.5.13**
 
 - **🔧 Fix: PyTorch 2.7+ triton import error** - Resolved installation crash caused by triton.ops import chain on newer triton versions
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "seedvr2_videoupscaler"
 description = "SeedVR2 official ComfyUI integration: ByteDance-Seed's one-step diffusion-based video/image upscaling with memory-efficient inference"
-version = "2.5.13"
+version = "2.5.14"
 authors = [
     {name = "numz"},
     {name = "adrientoupet"}
diff --git a/src/optimization/memory_manager.py b/src/optimization/memory_manager.py
@@ -13,6 +13,12 @@
 from typing import Tuple, Dict, Any, Optional, List, Union
     
 
+def _device_str(device: Union[torch.device, str]) -> str:
+    """Return str(device) upper-cased, for comparison and logging; any value starting with 'MPS' (e.g. 'mps:0') collapses to 'MPS'."""
+    s = str(device).upper()
+    return 'MPS' if s.startswith('MPS') else s  # non-MPS strings are returned as-is, so 'CUDA' and 'CUDA:0' stay distinct
+
+
 def get_device_list(include_none: bool = False, include_cpu: bool = False) -> List[str]:
     """
     Get list of available compute devices for SeedVR2
@@ -106,6 +112,22 @@ def get_basic_vram_info(device: Optional[torch.device] = None) -> Dict[str, Any]
     print(f"⚠️ Memory check failed: {vram_info['error']} - No available backend!")
 
 
+def _enforce_vram_limit() -> None:
+    """
+    Set the per-process CUDA memory fraction to 1.0 on every CUDA device index (intended to prevent silent swap to system RAM).
+    Called once at module load. Returns immediately when CUDA is unavailable (e.g. MPS); any exception while setting is ignored.
+    """
+    if not torch.cuda.is_available():
+        return
+    try:
+        for i in range(torch.cuda.device_count()):
+            torch.cuda.set_per_process_memory_fraction(1.0, i)
+    except Exception:  # best-effort: the try wraps the whole loop, so one failure also skips the remaining devices
+        pass
+
+_enforce_vram_limit()  # module-level call: runs as a side effect of importing this module
+
+
 def get_vram_usage(device: Optional[torch.device] = None, debug: Optional['Debug'] = None) -> Tuple[float, float, float]:
     """
     Get current VRAM usage metrics for monitoring.
@@ -591,7 +613,7 @@ def manage_tensor(
     target_dtype = dtype if dtype is not None else current_dtype
     
     # Check if movement is actually needed
-    needs_device_move = current_device != target_device
+    needs_device_move = _device_str(current_device) != _device_str(target_device)
     needs_dtype_change = dtype is not None and current_dtype != target_dtype
     
     if not needs_device_move and not needs_dtype_change:
@@ -609,8 +631,8 @@ def manage_tensor(
     
     # Log the movement
     if debug:
-        current_device_str = str(current_device).upper()
-        target_device_str = str(target_device).upper()
+        current_device_str = _device_str(current_device)
+        target_device_str = _device_str(target_device)
         
         dtype_info = ""
         if needs_dtype_change:
@@ -681,8 +703,8 @@ def manage_model_device(model: torch.nn.Module, target_device: torch.device, mod
     
     # Extract device type for comparison (both are torch.device objects)
     target_type = target_device.type
-    current_device_upper = str(current_device).upper()
-    target_device_upper = str(target_device).upper()
+    current_device_upper = _device_str(current_device)
+    target_device_upper = _device_str(target_device)
 
     # Compare normalized device types
     if current_device_upper == target_device_upper and not is_blockswap_model:
@@ -737,10 +759,10 @@ def _handle_blockswap_model_movement(runner: Any, model: torch.nn.Module,
                 actual_source_device = param.device
                 break
         
-        source_device_desc = str(actual_source_device).upper() if actual_source_device else str(target_device).upper()
+        source_device_desc = _device_str(actual_source_device) if actual_source_device else _device_str(target_device)
         
         if debug:
-            debug.log(f"Moving {model_name} from {source_device_desc} to {str(target_device).upper()} ({reason or 'model caching'})", category="general")
+            debug.log(f"Moving {model_name} from {source_device_desc} to {_device_str(target_device)} ({reason or 'model caching'})", category="general")
         
         # Enable bypass to allow movement
         set_blockswap_bypass(runner=runner, bypass=True, debug=debug)
@@ -755,7 +777,7 @@ def _handle_blockswap_model_movement(runner: Any, model: torch.nn.Module,
         model.zero_grad(set_to_none=True)
         
         if debug:
-            debug.end_timer(timer_name, f"BlockSwap model offloaded to {str(target_device).upper()}")
+            debug.end_timer(timer_name, f"BlockSwap model offloaded to {_device_str(target_device)}")
         
         return True
         
@@ -775,10 +797,10 @@ def _handle_blockswap_model_movement(runner: Any, model: torch.nn.Module,
                 actual_current_device = param.device
                 break
         
-        current_device_desc = str(actual_current_device).upper() if actual_current_device else "OFFLOAD"
+        current_device_desc = _device_str(actual_current_device) if actual_current_device else "OFFLOAD"
         
         if debug:
-            debug.log(f"Moving {model_name} from {current_device_desc} to {str(target_device).upper()} ({reason or 'inference requirement'})", category="general")
+            debug.log(f"Moving {model_name} from {current_device_desc} to {_device_str(target_device)} ({reason or 'inference requirement'})", category="general")
         
         timer_name = f"{model_name.lower()}_to_gpu"
         if debug:
@@ -818,7 +840,7 @@ def _handle_blockswap_model_movement(runner: Any, model: torch.nn.Module,
                     blocks_on_gpu = model._block_swap_config.get('total_blocks', 32) - model._block_swap_config.get('blocks_swapped', 16)
                     total_blocks = model._block_swap_config.get('total_blocks', 32)
                     main_device = model._block_swap_config.get('main_device', 'GPU')
-                    debug.log(f"BlockSwap blocks restored to configured devices ({blocks_on_gpu}/{total_blocks} blocks on {str(main_device).upper()})", category="success")
+                    debug.log(f"BlockSwap blocks restored to configured devices ({blocks_on_gpu}/{total_blocks} blocks on {_device_str(main_device)})", category="success")
                 else:
                     debug.log("BlockSwap blocks restored to configured devices", category="success")
 
@@ -865,8 +887,8 @@ def _standard_model_movement(model: torch.nn.Module, current_device: torch.devic
     
     # Log the movement with full device strings
     if debug:
-        current_device_str = str(current_device).upper()
-        target_device_str = str(target_device).upper()
+        current_device_str = _device_str(current_device)
+        target_device_str = _device_str(target_device)
         debug.log(f"Moving {model_name} from {current_device_str} to {target_device_str} ({reason})", category="general")
 
     # Start timer based on direction
@@ -891,7 +913,7 @@ def _standard_model_movement(model: torch.nn.Module, current_device: torch.devic
     
     # End timer
     if debug:
-        debug.end_timer(timer_name, f"{model_name} moved to {str(target_device).upper()}")
+        debug.end_timer(timer_name, f"{model_name} moved to {_device_str(target_device)}")
     
     return True
 
diff --git a/src/utils/constants.py b/src/utils/constants.py
@@ -4,7 +4,7 @@
 """
 
 # Version information
-__version__ = "2.5.13"
+__version__ = "2.5.14"
 
 import os
 import warnings
diff --git a/src/utils/debug.py b/src/utils/debug.py
@@ -14,6 +14,14 @@
 from ..utils.constants import __version__
 
 
+def _format_peak_with_swap(peak_gb: float, total_vram_gb: float) -> str:
+    """Format peak_gb as 'X.XXGB'; if total_vram_gb is positive and peak_gb exceeds it, append a '(GPU + swap)' breakdown."""
+    if total_vram_gb > 0 and peak_gb > total_vram_gb:
+        swap_gb = peak_gb - total_vram_gb  # excess over total_vram_gb, computed from the unrounded total
+        return f"{peak_gb:.2f}GB ({total_vram_gb:.0f}GB GPU + {swap_gb:.2f}GB swap)"  # GPU total is shown rounded to whole GB
+    return f"{peak_gb:.2f}GB"
+
+
 class Debug:
     """
     Unified debug logging for generation pipeline and BlockSwap monitoring
@@ -307,7 +315,12 @@ def log_memory_state(self, label: str, show_diff: bool = True, show_tensors: boo
         if show_diff and self.memory_checkpoints:
             self._log_memory_diff(current_metrics=memory_info, force=force)
 
-       # Log detailed analysis if requested
+        # Warn if swap detected (peak > physical VRAM)
+        if memory_info['vram_total'] > 0 and memory_info['vram_peak_since_last'] > memory_info['vram_total']:
+            self.log("VRAM swap detected - severe slowdown expected. Consider optimizing (e.g., reduce resolution, batch_size, enable BlockSwap, VAE tiling...).", 
+                     level="WARNING", category="memory", force=True)
+
+        # Log detailed analysis if requested
         if detailed_tensors and tensor_stats.get('details'):
             self._log_detailed_tensor_analysis(details=tensor_stats['details'], force=force)
                 
@@ -361,9 +374,10 @@ def _collect_memory_metrics(self) -> Dict[str, Any]:
                 metrics['vram_total'] = vram_info["total_gb"]
                 
                 backend = "MPS" if (hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()) else "VRAM"
+                peak_str = _format_peak_with_swap(metrics['vram_peak_since_last'], metrics['vram_total'])
                 metrics['summary_vram'] = (f"  [{backend}] {metrics['vram_allocated']:.2f}GB allocated / "
                         f"{metrics['vram_reserved']:.2f}GB reserved / "
-                        f"Peak: {metrics['vram_peak_since_last']:.2f}GB / "
+                        f"Peak: {peak_str} / "
                         f"{metrics['vram_free']:.2f}GB free / "
                         f"{metrics['vram_total']:.2f}GB total")
             else:
@@ -525,6 +539,13 @@ def log_peak_memory_summary(self, force: bool = True) -> None:
         
         is_mps = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not torch.cuda.is_available()
         
+        # Get total VRAM for swap detection (reuse existing function)
+        total_vram_gb = 0.0
+        if not is_mps:
+            vram_info = get_basic_vram_info(device=None)
+            if "error" not in vram_info:
+                total_vram_gb = vram_info["total_gb"]
+        
         self.log("", category="none", force=force)
         self.log("────────────────────────", category="none", force=force)
         self.log("Peak memory by phase:", category="memory", force=force)
@@ -539,15 +560,15 @@ def log_peak_memory_summary(self, force: bool = True) -> None:
             if is_mps:
                 self.log(f"  Phase {phase_num} ({phase_name}): {vram:.2f}GB", category="memory", force=force)
             else:
-                self.log(f"  Phase {phase_num} ({phase_name}): VRAM {vram:.2f}GB | RAM {ram:.2f}GB", category="memory", force=force)
+                self.log(f"  Phase {phase_num} ({phase_name}): {_format_peak_with_swap(vram, total_vram_gb)} | RAM {ram:.2f}GB", category="memory", force=force)
         
         if is_mps:
             overall = max(self.phase_vram_peaks.values()) if self.phase_vram_peaks else 0
             self.log(f"Overall Peak: {overall:.2f}GB", category="memory", force=force)
         else:
             overall_vram = max(self.phase_vram_peaks.values()) if self.phase_vram_peaks else 0
             overall_ram = max(self.phase_ram_peaks.values()) if self.phase_ram_peaks else 0
-            self.log(f"Overall peak: VRAM {overall_vram:.2f}GB | RAM {overall_ram:.2f}GB", category="memory", force=force)
+            self.log(f"Overall peak: {_format_peak_with_swap(overall_vram, total_vram_gb)} | RAM {overall_ram:.2f}GB", category="memory", force=force)
     
     @torch._dynamo.disable  # Skip tracing to avoid time.time() warnings
     def _store_checkpoint(self, label: str, metrics: Dict[str, Any]) -> None: