Update Sana for DC-AE's recent commit (replace DCAE_HF with DCAE)

lawrence-cj · lawrence-cj · commit abee1eecf648 · 2024-12-09T18:33:04.000+08:00
diff --git a/scripts/convert_sana_pag_to_diffusers.py b/scripts/convert_sana_pag_to_diffusers.py
@@ -9,8 +9,7 @@
 from accelerate import init_empty_weights
 from diffusers import (
     DCAE,
-    DCAE_HF,
-    FlowDPMSolverMultistepScheduler,
+    DPMSolverMultistepScheduler,
     FlowMatchEulerDiscreteScheduler,
     SanaPAGPipeline,
     SanaTransformer2DModel,
@@ -186,27 +185,10 @@ def main(args):
     else:
         print(colored(f"Saving the whole SanaPAGPipeline containing {args.model_type}", "green", attrs=["bold"]))
         # VAE
-        dc_ae = DCAE_HF.from_pretrained(f"mit-han-lab/dc-ae-f32c32-sana-1.0")
-        dc_ae_state_dict = dc_ae.state_dict()
-        dc_ae = DCAE(
-            in_channels=3,
-            latent_channels=32,
-            encoder_width_list=[128, 256, 512, 512, 1024, 1024],
-            encoder_depth_list=[2, 2, 2, 3, 3, 3],
-            encoder_block_type=["ResBlock", "ResBlock", "ResBlock", "EViTS5_GLU", "EViTS5_GLU", "EViTS5_GLU"],
-            encoder_norm="rms2d",
-            encoder_act="silu",
-            downsample_block_type="Conv",
-            decoder_width_list=[128, 256, 512, 512, 1024, 1024],
-            decoder_depth_list=[3, 3, 3, 3, 3, 3],
-            decoder_block_type=["ResBlock", "ResBlock", "ResBlock", "EViTS5_GLU", "EViTS5_GLU", "EViTS5_GLU"],
-            decoder_norm="rms2d",
-            decoder_act="silu",
-            upsample_block_type="InterpolateConv",
-            scaling_factor=0.41407,
-        )
-        dc_ae.load_state_dict(dc_ae_state_dict, strict=True)
-        dc_ae.to(torch.float32).to(device)
+        dc_ae = DCAE.from_pretrained(
+            "Efficient-Large-Model/dc_ae_f32c32_sana_1.0_diffusers",
+            torch_dtype=torch.float32,
+        ).to(device)
 
         # Text Encoder
         text_encoder_model_path = "google/gemma-2-2b-it"
@@ -220,7 +202,11 @@ def main(args):
 
         # Scheduler
         if args.scheduler_type == "flow-dpm_solver":
-            scheduler = FlowDPMSolverMultistepScheduler(flow_shift=flow_shift)
+            scheduler = DPMSolverMultistepScheduler(
+                flow_shift=flow_shift, 
+                use_flow_sigmas=True,
+                prediction_type="flow_prediction",
+            )
         elif args.scheduler_type == "flow-euler":
             scheduler = FlowMatchEulerDiscreteScheduler(shift=flow_shift)
         else:
diff --git a/scripts/convert_sana_to_diffusers.py b/scripts/convert_sana_to_diffusers.py
@@ -9,8 +9,7 @@
 from accelerate import init_empty_weights
 from diffusers import (
     DCAE,
-    DCAE_HF,
-    FlowDPMSolverMultistepScheduler,
+    DPMSolverMultistepScheduler,
     FlowMatchEulerDiscreteScheduler,
     SanaPipeline,
     SanaTransformer2DModel,
@@ -186,27 +185,10 @@ def main(args):
     else:
         print(colored(f"Saving the whole SanaPipeline containing {args.model_type}", "green", attrs=["bold"]))
         # VAE
-        dc_ae = DCAE_HF.from_pretrained(f"mit-han-lab/dc-ae-f32c32-sana-1.0")
-        dc_ae_state_dict = dc_ae.state_dict()
-        dc_ae = DCAE(
-            in_channels=3,
-            latent_channels=32,
-            encoder_width_list=[128, 256, 512, 512, 1024, 1024],
-            encoder_depth_list=[2, 2, 2, 3, 3, 3],
-            encoder_block_type=["ResBlock", "ResBlock", "ResBlock", "EViTS5_GLU", "EViTS5_GLU", "EViTS5_GLU"],
-            encoder_norm="rms2d",
-            encoder_act="silu",
-            downsample_block_type="Conv",
-            decoder_width_list=[128, 256, 512, 512, 1024, 1024],
-            decoder_depth_list=[3, 3, 3, 3, 3, 3],
-            decoder_block_type=["ResBlock", "ResBlock", "ResBlock", "EViTS5_GLU", "EViTS5_GLU", "EViTS5_GLU"],
-            decoder_norm="rms2d",
-            decoder_act="silu",
-            upsample_block_type="InterpolateConv",
-            scaling_factor=0.41407,
-        )
-        dc_ae.load_state_dict(dc_ae_state_dict, strict=True)
-        dc_ae.to(torch.float32).to(device)
+        dc_ae = DCAE.from_pretrained(
+            "Efficient-Large-Model/dc_ae_f32c32_sana_1.0_diffusers",
+            torch_dtype=torch.float32,
+        ).to(device)
 
         # Text Encoder
         text_encoder_model_path = "google/gemma-2-2b-it"
@@ -220,7 +202,11 @@ def main(args):
 
         # Scheduler
         if args.scheduler_type == "flow-dpm_solver":
-            scheduler = FlowDPMSolverMultistepScheduler(flow_shift=flow_shift)
+            scheduler = DPMSolverMultistepScheduler(
+                flow_shift=flow_shift, 
+                use_flow_sigmas=True,
+                prediction_type="flow_prediction",
+            )
         elif args.scheduler_type == "flow-euler":
             scheduler = FlowMatchEulerDiscreteScheduler(shift=flow_shift)
         else:
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
@@ -131,7 +131,6 @@
             "UVit2DModel",
             "VQModel",
             "DCAE",
-            "DCAE_HF",
         ]
     )
     _import_structure["optimization"] = [
@@ -577,7 +576,6 @@
     else:
         from .models import (
             DCAE,
-            DCAE_HF,
             AllegroTransformer3DModel,
             AsymmetricAutoencoderKL,
             AuraFlowTransformer2DModel,
diff --git a/src/diffusers/models/autoencoders/__init__.py b/src/diffusers/models/autoencoders/__init__.py
@@ -8,5 +8,5 @@
 from .autoencoder_oobleck import AutoencoderOobleck
 from .autoencoder_tiny import AutoencoderTiny
 from .consistency_decoder_vae import ConsistencyDecoderVAE
-from .dc_ae import DCAE, DCAE_HF
+from .autoencoder_dc import DCAE
 from .vq_model import VQModel
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sana.py b/src/diffusers/pipelines/pag/pipeline_pag_sana.py
@@ -22,7 +22,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from ...image_processor import PixArtImageProcessor
-from ...models import DCAE_HF, SanaTransformer2DModel
+from ...models import DCAE, SanaTransformer2DModel
 from ...models.attention_processor import PAGCFGSanaLinearAttnProcessor2_0, PAGIdentitySanaLinearAttnProcessor2_0
 from ...schedulers import FlowDPMSolverMultistepScheduler
 from ...utils import (
@@ -162,7 +162,7 @@ def __init__(
         self,
         tokenizer: AutoTokenizer,
         text_encoder: AutoModelForCausalLM,
-        vae: DCAE_HF,
+        vae: DCAE,
         transformer: SanaTransformer2DModel,
         scheduler: FlowDPMSolverMultistepScheduler,
         pag_applied_layers: Union[str, List[str]] = "blocks.1",  # 1st transformer block
@@ -840,22 +840,27 @@ def __call__(
                     noise_pred = noise_pred
 
                 # compute previous image: x_t -> x_t-1
+                latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)
-            # set to None for next
 
-        if not output_type == "latent":
-            image = self.vae.decode(latents.to(self.vae.dtype) / self.vae.config.scaling_factor)
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = latents.to(self.vae.dtype)
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
             if use_resolution_binning:
                 image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
-        else:
-            image = latents
 
         if not output_type == "latent":
             image = self.image_processor.postprocess(image, output_type=output_type)
diff --git a/src/diffusers/pipelines/sana/pipeline_sana.py b/src/diffusers/pipelines/sana/pipeline_sana.py
@@ -22,7 +22,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from ...image_processor import PixArtImageProcessor
-from ...models import DCAE_HF, SanaTransformer2DModel
+from ...models import DCAE, SanaTransformer2DModel
 from ...schedulers import FlowDPMSolverMultistepScheduler
 from ...utils import (
     BACKENDS_MAPPING,
@@ -157,7 +157,7 @@ def __init__(
         self,
         tokenizer: AutoTokenizer,
         text_encoder: AutoModelForCausalLM,
-        vae: DCAE_HF,
+        vae: DCAE,
         transformer: SanaTransformer2DModel,
         scheduler: FlowDPMSolverMultistepScheduler,
     ):
@@ -793,23 +793,27 @@ def __call__(
                     noise_pred = noise_pred
 
                 # compute previous image: x_t -> x_t-1
+                latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
 
+                if latents.dtype != latents_dtype:
+                    if torch.backends.mps.is_available():
+                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                        latents = latents.to(latents_dtype)
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)
 
-        if not output_type == "latent":
-            # image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-            # Temporary for DCAE_HF(the not ready version)
-            image = self.vae.decode(latents.to(self.vae.dtype) / self.vae.config.scaling_factor)
+        if output_type == "latent":
+            image = latents
+        else:
+            latents = latents.to(self.vae.dtype)
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
             if use_resolution_binning:
                 image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
-        else:
-            image = latents
 
         if not output_type == "latent":
             image = self.image_processor.postprocess(image, output_type=output_type)