huggingface · yiyixuxu · Jan 14, 2025 · Jan 8, 2025 · Jan 8, 2025 · Jan 12, 2025
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sana.py b/src/diffusers/pipelines/pag/pipeline_pag_sana.py
@@ -41,6 +41,7 @@
     ASPECT_RATIO_1024_BIN,
 )
 from ..pixart_alpha.pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN
+from ..sana.pipeline_sana import ASPECT_RATIO_4096_BIN
 from .pag_utils import PAGMixin
 
 
@@ -639,7 +640,7 @@ def __call__(
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        clean_caption: bool = True,
+        clean_caption: bool = False,
         use_resolution_binning: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
@@ -755,7 +756,9 @@ def __call__(
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
         if use_resolution_binning:
-            if self.transformer.config.sample_size == 64:
+            if self.transformer.config.sample_size == 128:
+                aspect_ratio_bin = ASPECT_RATIO_4096_BIN
+            elif self.transformer.config.sample_size == 64:
                 aspect_ratio_bin = ASPECT_RATIO_2048_BIN
             elif self.transformer.config.sample_size == 32:
                 aspect_ratio_bin = ASPECT_RATIO_1024_BIN
@@ -912,7 +915,13 @@ def __call__(
             image = latents
         else:
             latents = latents.to(self.vae.dtype)
-            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            try:
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            except torch.cuda.OutOfMemoryError as e:
+                print("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
+                self.vae.enable_tiling(tile_sample_min_width=1024, tile_sample_min_height=1024)
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+                self.vae.disable_tiling()
             if use_resolution_binning:
                 image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
 

diff --git a/src/diffusers/pipelines/sana/pipeline_sana.py b/src/diffusers/pipelines/sana/pipeline_sana.py
@@ -953,7 +953,13 @@ def __call__(
             image = latents
         else:
             latents = latents.to(self.vae.dtype)
-            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            try:
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            except torch.cuda.OutOfMemoryError as e:
+                print("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
+                self.vae.enable_tiling(tile_sample_min_width=1024, tile_sample_min_height=1024)
+                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+                self.vae.disable_tiling()
             if use_resolution_binning:
                 image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)