Skip to content

Commit 32f9900

Browse files
authored
Merge pull request #407 from AInVFX/main
v2.5.21: fix GGUF dequant regression, MPS performance optimizations
2 parents a1486a3 + 84abef8 commit 32f9900

File tree

9 files changed

+75
-26
lines changed

9 files changed

+75
-26
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,13 @@ We're actively working on improvements and new features. To stay informed:
3636

3737
## 🚀 Release Notes
3838

39+
**2025.12.12 - Version 2.5.21**
40+
41+
- **🛠️ Fix: GGUF dequantization error on MPS** - Resolved shape mismatch error introduced in 2.5.20 by skipping GGUF quantized buffers in precision conversion - these must remain in packed format for on-the-fly dequantization during inference
42+
- **🍎 MPS: Eliminate CPU sync overhead** - Skip unnecessary CPU tensor offload on Apple Silicon unified memory architecture, preventing sync stalls that caused slowdowns. Input images and output video now stay on the MPS device throughout the pipeline
43+
- **⚡ MPS: Preload text embeddings** - Load text embeddings before Phase 1 encoding to avoid a sync stall at the start of Phase 2, improving timing accuracy and throughput
44+
- **🧹 MPS: Optimized model cleanup** - Skip redundant CPU movement before model deletion on unified memory
45+
3946
**2025.12.12 - Version 2.5.20**
4047

4148
- **⚡ Expanded attention backends** - Full support for Flash Attention 2 (Ampere+), Flash Attention 3 (Hopper+), SageAttention 2, and SageAttention 3 (Blackwell/RTX 50xx), with automatic fallback chains to PyTorch SDPA when unavailable *(based on PR by [@naxci1](https://github.com/naxci1) - thank you!)*

inference_cli.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,9 @@
118118
prepare_runner,
119119
compute_generation_info,
120120
log_generation_start,
121-
blend_overlapping_frames
121+
blend_overlapping_frames,
122+
load_text_embeddings,
123+
script_directory
122124
)
123125
from src.core.generation_phases import (
124126
encode_all_batches,
@@ -858,6 +860,10 @@ def _process_frames_core(
858860
if runner_cache is not None:
859861
runner_cache['runner'] = runner
860862

863+
# Preload text embeddings before Phase 1 to avoid sync stall in Phase 2
864+
ctx['text_embeds'] = load_text_embeddings(script_directory, ctx['dit_device'], ctx['compute_dtype'], debug)
865+
debug.log("Loaded text embeddings for DiT", category="dit")
866+
861867
# Compute generation info and log start (handles prepending internally)
862868
frames_tensor, gen_info = compute_generation_info(
863869
ctx=ctx,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "seedvr2_videoupscaler"
33
description = "SeedVR2 official ComfyUI integration: ByteDance-Seed's one-step diffusion-based video/image upscaling with memory-efficient inference"
4-
version = "2.5.20"
4+
version = "2.5.21"
55
authors = [
66
{name = "numz"},
77
{name = "adrientoupet"}

src/core/generation_phases.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,11 @@ def encode_all_batches(
231231
if images is None:
232232
raise ValueError("Images to encode must be provided")
233233
else:
234-
ctx['input_images'] = images
234+
# MPS: keep on device to avoid sync overhead in Phase 4 color correction
235+
if ctx['vae_device'].type == 'mps' and images.device.type != 'mps':
236+
ctx['input_images'] = images.to(ctx['vae_device'])
237+
else:
238+
ctx['input_images'] = images
235239

236240
# Get total frame count from context (set in video_upscaler before encoding)
237241
total_frames = ctx.get('total_frames', len(images))
@@ -529,6 +533,10 @@ def encode_all_batches(
529533
manage_model_device(model=runner.vae, target_device=ctx['vae_offload_device'],
530534
model_name="VAE", debug=debug, reason="VAE offload", runner=runner)
531535

536+
# MPS: sync to get accurate timing and free memory before Phase 2
537+
if ctx['vae_device'].type == 'mps':
538+
torch.mps.synchronize()
539+
532540
debug.end_timer("phase1_encoding", "Phase 1: VAE encoding complete", show_breakdown=True)
533541
debug.log_memory_state("After phase 1 (VAE encoding)", show_tensors=False)
534542

@@ -860,7 +868,13 @@ def decode_all_batches(
860868

861869
# Pre-allocate final_video at the START of decode phase (before any batch processing)
862870
# This ensures we only need memory for final_video + 1 batch, not final_video + all batch_samples
863-
target_device = ctx['tensor_offload_device'] if ctx['tensor_offload_device'] is not None else 'cpu'
871+
# MPS: keep on device (unified memory, no benefit to CPU offload)
872+
if ctx['tensor_offload_device'] is not None:
873+
target_device = ctx['tensor_offload_device']
874+
elif ctx['vae_device'].type == 'mps':
875+
target_device = ctx['vae_device']
876+
else:
877+
target_device = 'cpu'
864878
channels_str = "RGBA" if C == 4 else "RGB"
865879
required_gb = (total_frames * true_h * true_w * C * 2) / (1024**3)
866880
debug.log(f"Pre-allocating output tensor: {total_frames} frames, {true_w}x{true_h}px, {channels_str} ({required_gb:.2f}GB)",
@@ -1040,6 +1054,10 @@ def decode_all_batches(
10401054
if 'all_upscaled_latents' in ctx:
10411055
release_tensor_collection(ctx['all_upscaled_latents'])
10421056
del ctx['all_upscaled_latents']
1057+
1058+
# MPS: sync to get accurate timing and free memory before Phase 4
1059+
if ctx['vae_device'].type == 'mps':
1060+
torch.mps.synchronize()
10431061

10441062
debug.end_timer("phase3_decoding", "Phase 3: VAE decoding complete", show_breakdown=True)
10451063
debug.log_memory_state("After phase 3 (VAE decoding)", show_tensors=False)

src/core/generation_utils.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,12 @@ def _normalize_device(device_spec: Optional[Union[str, torch.device]]) -> torch.
350350
vae_device = _normalize_device(vae_device)
351351
dit_offload_device = _normalize_device(dit_offload_device) if dit_offload_device is not None else None
352352
vae_offload_device = _normalize_device(vae_offload_device) if vae_offload_device is not None else None
353-
tensor_offload_device = _normalize_device(tensor_offload_device) if tensor_offload_device is not None else None
353+
# MPS unified memory: CPU offload causes sync overhead with no memory benefit
354+
is_mps = dit_device.type == 'mps' or vae_device.type == 'mps'
355+
if is_mps and tensor_offload_device is not None and str(tensor_offload_device) == 'cpu':
356+
tensor_offload_device = None
357+
else:
358+
tensor_offload_device = _normalize_device(tensor_offload_device) if tensor_offload_device is not None else None
354359

355360
# Set LOCAL_RANK to 0 for single-GPU inference mode
356361
# CLI multi-GPU uses CUDA_VISIBLE_DEVICES to restrict visibility per worker

src/interfaces/video_upscaler.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
setup_generation_context,
2020
prepare_runner,
2121
compute_generation_info,
22-
log_generation_start
22+
log_generation_start,
23+
load_text_embeddings,
24+
script_directory
2325
)
2426
from ..optimization.memory_manager import (
2527
cleanup_text_embeddings,
@@ -437,6 +439,10 @@ def cleanup(dit_cache: bool = False, vae_cache: bool = False) -> None:
437439
# Store cache context in ctx for use in generation phases
438440
ctx['cache_context'] = cache_context
439441

442+
# Preload text embeddings before Phase 1 to avoid sync stall in Phase 2
443+
ctx['text_embeds'] = load_text_embeddings(script_directory, ctx['dit_device'], ctx['compute_dtype'], debug)
444+
debug.log("Loaded text embeddings for DiT", category="dit")
445+
440446
debug.log_memory_state("After model preparation", show_tensors=False, detailed_tensors=False)
441447
debug.end_timer("model_preparation", "Model preparation", force=True, show_breakdown=True)
442448

src/optimization/compatibility.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -826,8 +826,11 @@ def _force_nadit_precision(self, target_dtype: torch.dtype = torch.bfloat16) ->
826826
param.data = param.data.to(target_dtype)
827827
converted_count += 1
828828

829-
# Also convert buffers
829+
# Also convert buffers (skip GGUF quantized buffers - they have tensor_type attribute)
830830
for name, buffer in self.dit_model.named_buffers():
831+
# Skip GGUF quantized buffers - these must stay in packed format for on-the-fly dequantization
832+
if hasattr(buffer, 'tensor_type'):
833+
continue
831834
if buffer.dtype != target_dtype:
832835
if buffer.device.type == "mps":
833836
temp_cpu = buffer.data.to("cpu")

src/optimization/memory_manager.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,15 +1050,17 @@ def cleanup_dit(runner: Any, debug: Optional['Debug'] = None, cache_model: bool
10501050

10511051
# Move model off GPU if needed
10521052
if param_device.type not in ['meta', 'cpu']:
1053-
# Get offload target - default to 'cpu' if not configured or set to 'none'
1054-
offload_target = getattr(runner, '_dit_offload_device', None)
1055-
if offload_target is None or offload_target == 'none':
1056-
offload_target = torch.device('cpu')
1057-
1058-
# Move model off GPU (either for caching or before deletion)
1059-
reason = "model caching" if cache_model else "releasing GPU memory"
1060-
manage_model_device(model=runner.dit, target_device=offload_target, model_name="DiT",
1061-
debug=debug, reason=reason, runner=runner)
1053+
# MPS: skip CPU movement before deletion (unified memory, just causes sync)
1054+
if param_device.type == 'mps' and not cache_model:
1055+
if debug:
1056+
debug.log("DiT on MPS - skipping CPU movement before deletion", category="cleanup")
1057+
else:
1058+
offload_target = getattr(runner, '_dit_offload_device', None)
1059+
if offload_target is None or offload_target == 'none':
1060+
offload_target = torch.device('cpu')
1061+
reason = "model caching" if cache_model else "releasing GPU memory"
1062+
manage_model_device(model=runner.dit, target_device=offload_target, model_name="DiT",
1063+
debug=debug, reason=reason, runner=runner)
10621064
elif param_device.type == 'meta' and debug:
10631065
debug.log("DiT on meta device - keeping structure for cache", category="cleanup")
10641066
except StopIteration:
@@ -1126,15 +1128,17 @@ def cleanup_vae(runner: Any, debug: Optional['Debug'] = None, cache_model: bool
11261128

11271129
# Move model off GPU if needed
11281130
if param_device.type not in ['meta', 'cpu']:
1129-
# Get offload target - default to 'cpu' if not configured or set to 'none'
1130-
offload_target = getattr(runner, '_vae_offload_device', None)
1131-
if offload_target is None or offload_target == 'none':
1132-
offload_target = torch.device('cpu')
1133-
1134-
# Move model off GPU (either for caching or before deletion)
1135-
reason = "model caching" if cache_model else "releasing GPU memory"
1136-
manage_model_device(model=runner.vae, target_device=offload_target, model_name="VAE",
1137-
debug=debug, reason=reason, runner=runner)
1131+
# MPS: skip CPU movement before deletion (unified memory, just causes sync)
1132+
if param_device.type == 'mps' and not cache_model:
1133+
if debug:
1134+
debug.log("VAE on MPS - skipping CPU movement before deletion", category="cleanup")
1135+
else:
1136+
offload_target = getattr(runner, '_vae_offload_device', None)
1137+
if offload_target is None or offload_target == 'none':
1138+
offload_target = torch.device('cpu')
1139+
reason = "model caching" if cache_model else "releasing GPU memory"
1140+
manage_model_device(model=runner.vae, target_device=offload_target, model_name="VAE",
1141+
debug=debug, reason=reason, runner=runner)
11381142
elif param_device.type == 'meta' and debug:
11391143
debug.log("VAE on meta device - keeping structure for cache", category="cleanup")
11401144
except StopIteration:

src/utils/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"""
55

66
# Version information
7-
__version__ = "2.5.20"
7+
__version__ = "2.5.21"
88

99
import os
1010
import warnings

0 commit comments

Comments
 (0)