1. Update the code to build on AutoencoderDC;

lawrence-cj · lawrence-cj · commit 297c0e7802d7 · 2024-12-09T18:33:05.000+08:00
2. Fix the bug in the new GLUMBConv;
3. Runs successfully;
diff --git a/scripts/convert_sana_pag_to_diffusers.py b/scripts/convert_sana_pag_to_diffusers.py
@@ -59,12 +59,11 @@ def main(args):
     # y norm
     converted_state_dict["caption_norm.weight"] = state_dict.pop("attention_y_norm.weight")
 
+    flow_shift = 3.0
     if args.model_type == "SanaMS_1600M_P1_D20":
         layer_num = 20
-        flow_shift = 3.0
     elif args.model_type == "SanaMS_600M_P1_D28":
         layer_num = 28
-        flow_shift = 4.0
     else:
         raise ValueError(f"{args.model_type} is not supported.")
 
@@ -89,19 +88,19 @@ def main(args):
         )
 
         # Feed-forward.
-        converted_state_dict[f"transformer_blocks.{depth}.ff.inverted_conv.conv.weight"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.weight"] = state_dict.pop(
             f"blocks.{depth}.mlp.inverted_conv.conv.weight"
         )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.inverted_conv.conv.bias"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.bias"] = state_dict.pop(
             f"blocks.{depth}.mlp.inverted_conv.conv.bias"
         )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.depth_conv.conv.weight"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.weight"] = state_dict.pop(
             f"blocks.{depth}.mlp.depth_conv.conv.weight"
         )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.depth_conv.conv.bias"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.bias"] = state_dict.pop(
             f"blocks.{depth}.mlp.depth_conv.conv.bias"
         )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.point_conv.conv.weight"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_point.weight"] = state_dict.pop(
             f"blocks.{depth}.mlp.point_conv.conv.weight"
         )
 
@@ -156,8 +155,6 @@ def main(args):
             attention_type="default",
             use_pe=False,
             expand_ratio=2.5,
-            ff_bias=(True, True, False),
-            ff_norm=(None, None, None),
         )
     if is_accelerate_available():
         load_model_dict_into_meta(transformer, converted_state_dict)
diff --git a/scripts/convert_sana_to_diffusers.py b/scripts/convert_sana_to_diffusers.py
@@ -59,12 +59,11 @@ def main(args):
     # y norm
     converted_state_dict["caption_norm.weight"] = state_dict.pop("attention_y_norm.weight")
 
+    flow_shift = 3.0
     if args.model_type == "SanaMS_1600M_P1_D20":
         layer_num = 20
-        flow_shift = 3.0
     elif args.model_type == "SanaMS_600M_P1_D28":
         layer_num = 28
-        flow_shift = 4.0
     else:
         raise ValueError(f"{args.model_type} is not supported.")
 
@@ -89,19 +88,19 @@ def main(args):
         )
 
         # Feed-forward.
-        converted_state_dict[f"transformer_blocks.{depth}.ff.inverted_conv.conv.weight"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.weight"] = state_dict.pop(
             f"blocks.{depth}.mlp.inverted_conv.conv.weight"
         )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.inverted_conv.conv.bias"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_inverted.bias"] = state_dict.pop(
             f"blocks.{depth}.mlp.inverted_conv.conv.bias"
         )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.depth_conv.conv.weight"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.weight"] = state_dict.pop(
             f"blocks.{depth}.mlp.depth_conv.conv.weight"
         )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.depth_conv.conv.bias"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_depth.bias"] = state_dict.pop(
             f"blocks.{depth}.mlp.depth_conv.conv.bias"
         )
-        converted_state_dict[f"transformer_blocks.{depth}.ff.point_conv.conv.weight"] = state_dict.pop(
+        converted_state_dict[f"transformer_blocks.{depth}.ff.conv_point.weight"] = state_dict.pop(
             f"blocks.{depth}.mlp.point_conv.conv.weight"
         )
 
@@ -156,8 +155,6 @@ def main(args):
             attention_type="default",
             use_pe=False,
             expand_ratio=2.5,
-            ff_bias=(True, True, False),
-            ff_norm=(None, None, None),
         )
     if is_accelerate_available():
         load_model_dict_into_meta(transformer, converted_state_dict)
@@ -188,8 +185,8 @@ def main(args):
         print(colored(f"Saving the whole SanaPipeline containing {args.model_type}", "green", attrs=["bold"]))
         # VAE
         ae = AutoencoderDC.from_pretrained(
-            "Efficient-Large-Model/dc_ae_f32c32_sana_1.0_diffusers",
-            torch_dtype=torch.float32,
+            "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers",
+            torch_dtype=torch.bfloat16,
         ).to(device)
 
         # Text Encoder
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
@@ -5552,6 +5552,11 @@ def __call__(
     CustomDiffusionAttnProcessor2_0,
     SlicedAttnProcessor,
     SlicedAttnAddedKVProcessor,
+    SanaLinearAttnProcessor2_0,
+    SanaMultiscaleLinearAttention,
+    SanaMultiscaleAttnProcessor2_0,
+    SanaMultiscaleAttentionProjection,
+    PAGCFGSanaLinearAttnProcessor2_0,
     IPAdapterAttnProcessor,
     IPAdapterAttnProcessor2_0,
     IPAdapterXFormersAttnProcessor,
@@ -5562,5 +5567,4 @@ def __call__(
     LoRAXFormersAttnProcessor,
     LoRAAttnAddedKVProcessor,
     SanaLinearAttnProcessor2_0,
-    PAGCFGSanaLinearAttnProcessor2_0,
 ]
diff --git a/src/diffusers/models/transformers/sana_transformer_2d.py b/src/diffusers/models/transformers/sana_transformer_2d.py
@@ -26,7 +26,6 @@
     FusedAttnProcessor2_0,
     SanaLinearAttnProcessor2_0,
 )
-from ..autoencoders.autoencoder_dc import GLUMBConv
 from ..embeddings import PatchEmbed, PixArtAlphaTextProjection, SinusoidalPositionalEmbedding
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -58,40 +57,40 @@ def __init__(self, dim, eps: float, elementwise_affine: bool = True, scale_facto
         self.weight = nn.Parameter(torch.ones(dim) * scale_factor)
 
 
-# Modified from diffusers.models.autoencoders.ecae.GLUMBConv
+# Modified from diffusers.models.autoencoders.autoencoder_dc.GLUMBConv
 @maybe_allow_in_graph
 class SanaGLUMBConv(nn.Module):
-    def __init__(self, in_channels: int, out_channels: int) -> None:
+    def __init__(self, in_channels: int, out_channels: int, expand_ratio: float = 2.5) -> None:
         super().__init__()
 
-        hidden_channels = int(2.5 * in_channels)
+        hidden_channels = int(expand_ratio * in_channels)
 
         self.nonlinearity = nn.SiLU()
 
         self.conv_inverted = nn.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
         self.conv_depth = nn.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
         self.conv_point = nn.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)
-        self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
 
-    def forward(self, x: torch.Tensor, HW=None) -> torch.Tensor:
-        B, N, C = x.shape
+    def forward(self, hidden_states: torch.Tensor, HW: Optional[tuple[int]] = None) -> torch.Tensor:
+        B, N, C = hidden_states.shape
         if HW is None:
             H = W = int(N**0.5)
         else:
             H, W = HW
 
-        x = x.reshape(B, H, W, C).permute(0, 3, 1, 2)
-        x = self.inverted_conv(x)
-        x = self.depth_conv(x)
+        hidden_states = hidden_states.reshape(B, H, W, C).permute(0, 3, 1, 2)
 
-        x, gate = torch.chunk(x, 2, dim=1)
-        gate = self.glu_act(gate)
-        x = x * gate
+        hidden_states = self.conv_inverted(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
 
-        x = self.point_conv(x)
-        x = x.reshape(B, C, N).permute(0, 2, 1)
+        hidden_states = self.conv_depth(hidden_states)
+        hidden_states, gate = torch.chunk(hidden_states, 2, dim=1)
+        hidden_states = hidden_states * self.nonlinearity(gate)
 
-        return x
+        hidden_states = self.conv_point(hidden_states)
+        hidden_states = hidden_states.reshape(B, C, N).permute(0, 2, 1)
+
+        return hidden_states
 
 
 # Modified from diffusers.models.attention.BasicTransformerBlock
@@ -130,8 +129,6 @@ def __init__(
         use_pe: bool = False,
         num_positional_embeddings: Optional[int] = None,
         expand_ratio: float = 2.5,
-        ff_bias: tuple =(True, True, False),
-        ff_norm: tuple =(None, None, None),
     ):
         super().__init__()
         self.dim = dim
@@ -186,9 +183,6 @@ def __init__(
             in_channels=dim,
             out_channels=dim,
             expand_ratio=expand_ratio,
-            use_bias=ff_bias,
-            norm=ff_norm,
-            act_func=activation_fn,
         )
 
         # 5. Scale-shift for Sana.
@@ -362,8 +356,6 @@ def __init__(
         attention_type: Optional[str] = "default",
         use_pe: Optional[bool] = False,
         expand_ratio=2.5,
-        ff_bias: tuple =(True, True, False),
-        ff_norm: tuple =(None, None, None),
     ):
         super().__init__()
 
@@ -428,8 +420,6 @@ def __init__(
                     norm_eps=self.config.norm_eps,
                     use_pe=self.config.use_pe,
                     expand_ratio=self.config.expand_ratio,
-                    ff_bias=self.config.ff_bias,
-                    ff_norm=self.config.ff_norm,
                 )
                 for _ in range(self.config.num_layers)
             ]
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sana.py b/src/diffusers/pipelines/pag/pipeline_pag_sana.py
@@ -172,7 +172,7 @@ def __init__(
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )
 
-        self.vae_scale_factor = 2 ** (len(self.vae.config.encoder_width_list) - 1)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.encoder_block_out_channels) - 1)
         self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
 
         self.set_pag_applied_layers(
diff --git a/src/diffusers/pipelines/sana/pipeline_sana.py b/src/diffusers/pipelines/sana/pipeline_sana.py
@@ -168,7 +168,7 @@ def __init__(
         )
 
         self.vae_scale_factor = (
-            2 ** (len(self.vae.config.encoder_width_list) - 1) if hasattr(self, "vae") and self.vae is not None else 32
+            2 ** (len(self.vae.config.encoder_block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 32
         )
         self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
 

Original file line number	Diff line number	Diff line change
`@@ -172,7 +172,7 @@ def __init__(`
`172`	`172`	`tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler`
`173`	`173`	`)`
`174`	`174`
`175`		`- self.vae_scale_factor = 2 ** (len(self.vae.config.encoder_width_list) - 1)`
	`175`	`+ self.vae_scale_factor = 2 ** (len(self.vae.config.encoder_block_out_channels) - 1)`
`176`	`176`	`self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)`
`177`	`177`
`178`	`178`	`self.set_pag_applied_layers(`
Original file line number	Diff line number	Diff line change
`@@ -168,7 +168,7 @@ def __init__(`
`168`	`168`	`)`
`169`	`169`
`170`	`170`	`self.vae_scale_factor = (`
`171`		`- 2 ** (len(self.vae.config.encoder_width_list) - 1) if hasattr(self, "vae") and self.vae is not None else 32`
	`171`	`+ 2 ** (len(self.vae.config.encoder_block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 32`
`172`	`172`	`)`
`173`	`173`	`self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)`
`174`	`174`