 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from functools import partial
-from typing import Any, Dict, Optional, Union
+from typing import Dict, Optional, Union
 
 import torch
 from torch import nn
     Attention,
     AttentionProcessor,
     AttnProcessor2_0,
-    SanaMultiscaleLinearAttention,
     SanaLinearAttnProcessor2_0,
 )
 from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
@@ -135,7 +133,7 @@ def __init__(
             mlp_ratio=mlp_ratio,
         )
 
-        self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim ** 0.5)
+        self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
 
     def forward(
         self,
@@ -152,7 +150,7 @@ def forward(
         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
             self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
         ).chunk(6, dim=1)
-        
+
         # 2. Self Attention
         norm_hidden_states = self.norm1(hidden_states)
         norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
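
The hunk above covers the adaLN-style modulation in the transformer block: a learned (6, dim) table is added to the per-sample timestep embedding and split into shift/scale/gate triples for the attention and MLP branches, and the normalized hidden states are modulated as x * (1 + scale) + shift. A minimal, self-contained sketch of that pattern; the sizes and the plain LayerNorm are illustrative assumptions, not the model's exact modules:

import torch
from torch import nn

dim, batch_size, seq_len = 8, 2, 16                  # hypothetical sizes
scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
timestep = torch.randn(batch_size, 6 * dim)          # per-sample conditioning embedding
hidden_states = torch.randn(batch_size, seq_len, dim)

# Learned table + timestep embedding, split into six (batch, 1, dim) chunks.
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
    scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
).chunk(6, dim=1)

norm = nn.LayerNorm(dim, elementwise_affine=False)   # stand-in for the block's norm1
norm_hidden_states = norm(hidden_states) * (1 + scale_msa) + shift_msa
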
@@ -258,9 +256,7 @@ def __init__(
         )
 
         # 2. Caption Embedding
-        self.caption_projection = PixArtAlphaTextProjection(
-            in_features=caption_channels, hidden_size=inner_dim
-        )
+        self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
         self.caption_norm = RMSNorm(inner_dim, eps=1e-5)
 
         # 3. Transformer blocks
@@ -285,7 +281,7 @@ def __init__(
 
         # 4. Output blocks
         self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
-        
+
         self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
         self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)
 
@@ -401,12 +397,12 @@ def forward(
 
         encoder_hidden_states = self.caption_projection(encoder_hidden_states)
         encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
-        
+
         encoder_hidden_states = self.caption_norm(encoder_hidden_states)
 
         # 2. Transformer blocks
         use_reentrant = is_torch_version("<=", "1.11.0")
-        
+
         def create_block_forward(block):
             if torch.is_grad_enabled() and self.gradient_checkpointing:
                 return lambda *inputs: torch.utils.checkpoint.checkpoint(
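
This hunk also shows the gradient-checkpointing wrapper: use_reentrant is chosen from the installed torch version, and create_block_forward routes each block through torch.utils.checkpoint.checkpoint when checkpointing is enabled. A rough, self-contained sketch of that wrapper pattern; the TinyBlock module and the concrete sizes are hypothetical stand-ins for the real transformer block:

import torch
from torch import nn

class TinyBlock(nn.Module):
    def __init__(self, dim=8):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, hidden_states):
        return self.proj(hidden_states)

gradient_checkpointing = True
use_reentrant = False  # the diff derives this from is_torch_version("<=", "1.11.0")

def create_block_forward(block):
    # During training, recompute the block's activations in the backward pass
    # instead of storing them, trading compute for memory.
    if torch.is_grad_enabled() and gradient_checkpointing:
        return lambda *inputs: torch.utils.checkpoint.checkpoint(
            block.forward, *inputs, use_reentrant=use_reentrant
        )
    return block.forward

block = TinyBlock()
x = torch.randn(2, 16, 8, requires_grad=True)
out = create_block_forward(block)(x)
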
@@ -430,16 +426,23 @@ def create_block_forward(block):
             self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)
         ).chunk(2, dim=1)
         hidden_states = self.norm_out(hidden_states)
-        
+
         # 4. Modulation
         hidden_states = hidden_states * (1 + scale) + shift
         hidden_states = self.proj_out(hidden_states)
 
         # 5. Unpatchify
-        hidden_states = hidden_states.reshape(batch_size, post_patch_height, post_patch_width, self.config.patch_size, self.config.patch_size, -1)
+        hidden_states = hidden_states.reshape(
+            batch_size, post_patch_height, post_patch_width, self.config.patch_size, self.config.patch_size, -1
+        )
         hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
         output = hidden_states.reshape(
-            shape=(batch_size, -1, post_patch_height * self.config.patch_size, post_patch_width * self.config.patch_size)
+            shape=(
+                batch_size,
+                -1,
+                post_patch_height * self.config.patch_size,
+                post_patch_width * self.config.patch_size,
+            )
         )
 
         if not return_dict:
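
For reference, the reshape, permute, reshape sequence in the final hunk is the usual unpatchify step: per-patch channels are folded back into an image-shaped tensor. A small sketch with made-up sizes; the variable names mirror the diff, while the concrete numbers and standalone tensors are assumptions for illustration:

import torch

batch_size, patch_size, out_channels = 1, 2, 3
post_patch_height = post_patch_width = 4

# Shape after proj_out: (batch, num_patches, patch_size * patch_size * out_channels)
hidden_states = torch.randn(
    batch_size, post_patch_height * post_patch_width, patch_size * patch_size * out_channels
)

hidden_states = hidden_states.reshape(
    batch_size, post_patch_height, post_patch_width, patch_size, patch_size, -1
)
hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
output = hidden_states.reshape(
    batch_size, -1, post_patch_height * patch_size, post_patch_width * patch_size
)
assert output.shape == (batch_size, out_channels, 8, 8)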