
Commit b76493f

Change the AE-related code due to the latest update of the DCAE branch.
1 parent: 5687ba1

File tree: 2 files changed (+14, -26 lines)


scripts/convert_sana_pag_to_diffusers.py

Lines changed: 2 additions & 2 deletions
@@ -188,8 +188,8 @@ def main(args):
     print(colored(f"Saving the whole SanaPAGPipeline containing {args.model_type}", "green", attrs=["bold"]))
     # VAE
     ae = AutoencoderDC.from_pretrained(
-        "Efficient-Large-Model/dc_ae_f32c32_sana_1.0_diffusers",
-        torch_dtype=torch.float32,
+        "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers",
+        torch_dtype=torch.bfloat16,
     ).to(device)
 
     # Text Encoder
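
Note: both the checkpoint id and the load dtype of the DC-AE change in this hunk. As a hedged reference, a minimal sketch of loading and round-tripping an image through the autoencoder on its own follows; it assumes the mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers weights are available and follows diffusers' usual .latent / .sample output conventions for encode/decode, which are not part of this diff.

import torch
from diffusers import AutoencoderDC

# Load the DC-AE from its new Hub location in bfloat16, matching the converted pipeline.
device = "cuda" if torch.cuda.is_available() else "cpu"
ae = AutoencoderDC.from_pretrained(
    "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers",
    torch_dtype=torch.bfloat16,
).to(device)

# f32 spatial compression with 32 latent channels: a 256x256 image maps to an 8x8x32 latent.
image = torch.randn(1, 3, 256, 256, dtype=torch.bfloat16, device=device)
with torch.no_grad():
    latent = ae.encode(image).latent           # (1, 32, 8, 8)
    reconstruction = ae.decode(latent).sample  # (1, 3, 256, 256)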

src/diffusers/models/transformers/sana_transformer_2d.py

Lines changed: 12 additions & 24 deletions
@@ -60,30 +60,18 @@ def __init__(self, dim, eps: float, elementwise_affine: bool = True, scale_facto
 
 # Modified from diffusers.models.autoencoders.ecae.GLUMBConv
 @maybe_allow_in_graph
-class SanaGLUMBConv(GLUMBConv):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size=3,
-        stride=1,
-        mid_channels=None,
-        expand_ratio=2.5,
-        use_bias=False,
-        norm=(None, None, None),
-        act_func=("silu", "silu", None),
-    ):
-        super().__init__(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            mid_channels=mid_channels,
-            expand_ratio=expand_ratio,
-            use_bias=use_bias,
-            norm=norm,
-            act_func=act_func,
-        )
+class SanaGLUMBConv(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int) -> None:
+        super().__init__()
+
+        hidden_channels = int(2.5 * in_channels)
+
+        self.nonlinearity = nn.SiLU()
+
+        self.conv_inverted = nn.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
+        self.conv_depth = nn.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
+        self.conv_point = nn.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)
+        self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
 
     def forward(self, x: torch.Tensor, HW=None) -> torch.Tensor:
         B, N, C = x.shape
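
The hunk cuts off at the first line of forward, so the rest of the gated forward pass is not visible here. Below is a self-contained sketch of how these layers combine in the GLU-modulated inverted-bottleneck pattern that GLUMBConv uses; the token-to-grid reshape, the gating order, and the stand-in RMSNorm are assumptions for illustration, not the file's actual code.

import torch
import torch.nn as nn


class RMSNormSketch(nn.Module):
    # Stand-in for the RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)
    # used above, included only so this sketch runs on its own.
    def __init__(self, dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
        self.bias = nn.Parameter(torch.zeros(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        variance = x.float().pow(2).mean(dim=-1, keepdim=True)
        x = (x.float() * torch.rsqrt(variance + self.eps)).to(x.dtype)
        return x * self.weight + self.bias


class GLUMBConvSketch(nn.Module):
    # Same layer layout as the rewritten SanaGLUMBConv: 1x1 inverted expansion,
    # 3x3 depthwise conv whose output is split into value and gate halves (GLU),
    # 1x1 pointwise projection, then RMSNorm over the channel dimension.
    def __init__(self, in_channels: int, out_channels: int) -> None:
        super().__init__()
        hidden_channels = int(2.5 * in_channels)
        self.nonlinearity = nn.SiLU()
        self.conv_inverted = nn.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
        self.conv_depth = nn.Conv2d(
            hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2
        )
        self.conv_point = nn.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)
        self.norm = RMSNormSketch(out_channels, eps=1e-5)

    def forward(self, x: torch.Tensor, HW=None) -> torch.Tensor:
        # Tokens arrive as (B, N, C); fold them back into a (B, C, H, W) grid for the convs.
        B, N, C = x.shape
        H, W = HW if HW is not None else (int(N**0.5), int(N**0.5))
        x = x.transpose(1, 2).reshape(B, C, H, W)

        x = self.nonlinearity(self.conv_inverted(x))
        x = self.conv_depth(x)
        value, gate = torch.chunk(x, 2, dim=1)  # GLU: one half modulates the other
        x = self.conv_point(value * self.nonlinearity(gate))

        # Unfold back to tokens and normalize over channels.
        x = x.flatten(2).transpose(1, 2)
        return self.norm(x)


block = GLUMBConvSketch(32, 32)
tokens = torch.randn(2, 256, 32)
print(block(tokens, HW=(16, 16)).shape)  # torch.Size([2, 256, 32])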
