
Commit ad3935f

1. Fix bugs so the convert script runs successfully.
2. Download the checkpoint from the Hub automatically.
1 parent e7c1a59 commit ad3935f

File tree: 5 files changed (+32, -7 lines)

scripts/convert_sana_to_diffusers.py

Lines changed: 26 additions & 3 deletions
@@ -19,16 +19,39 @@
 )
 from diffusers.models.modeling_utils import load_model_dict_into_meta
 from diffusers.utils.import_utils import is_accelerate_available
-
+from huggingface_hub import hf_hub_download, snapshot_download

 CTX = init_empty_weights if is_accelerate_available else nullcontext

-ckpt_id = "Sana"
+ckpt_ids = [
+    "Efficient-Large-Model/Sana_1600M_1024px_MultiLing",
+    "Efficient-Large-Model/Sana_1600M_512px_MultiLing",
+    "Efficient-Large-Model/Sana_1600M_1024px",
+    "Efficient-Large-Model/Sana_1600M_512px",
+    "Efficient-Large-Model/Sana_600M_1024px",
+    "Efficient-Large-Model/Sana_600M_512px",
+]
 # https://github.com/NVlabs/Sana/blob/main/scripts/inference.py


 def main(args):
-    all_state_dict = torch.load(args.orig_ckpt_path, map_location=torch.device("cpu"))
+    ckpt_id = ckpt_ids[0]
+    cache_dir_path = os.path.expanduser("~/.cache/huggingface/hub")
+    if args.orig_ckpt_path is None:
+        snapshot_download(
+            repo_id=ckpt_id,
+            cache_dir=cache_dir_path,
+            repo_type="model",
+        )
+        file_path = hf_hub_download(
+            repo_id=ckpt_id,
+            filename=f"checkpoints/{ckpt_id.split('/')[-1]}.pth",
+            cache_dir=cache_dir_path,
+            repo_type="model",
+        )
+    else:
+        file_path = args.orig_ckpt_path
+    all_state_dict = torch.load(file_path, map_location=torch.device("cpu"))
     state_dict = all_state_dict.pop("state_dict")
     converted_state_dict = {}
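Note on the hunk above: when --orig_ckpt_path is not supplied, the script now resolves the checkpoint from the Hub itself. Below is a standalone sketch of that fallback, under the repo and checkpoints/<name>.pth layout shown in the diff; the helper name resolve_checkpoint is hypothetical, not part of the script.

```python
import os
from typing import Optional

from huggingface_hub import hf_hub_download, snapshot_download


def resolve_checkpoint(orig_ckpt_path: Optional[str], ckpt_id: str) -> str:
    """Return a local .pth path, downloading from the Hub when none is given."""
    if orig_ckpt_path is not None:
        return orig_ckpt_path

    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    # Mirror the script: pull the full repo snapshot into the local cache first.
    snapshot_download(repo_id=ckpt_id, cache_dir=cache_dir, repo_type="model")
    # Then resolve the concrete checkpoint file inside it, e.g.
    # checkpoints/Sana_1600M_1024px.pth for Efficient-Large-Model/Sana_1600M_1024px.
    return hf_hub_download(
        repo_id=ckpt_id,
        filename=f"checkpoints/{ckpt_id.split('/')[-1]}.pth",
        cache_dir=cache_dir,
        repo_type="model",
    )


if __name__ == "__main__":
    print(resolve_checkpoint(None, "Efficient-Large-Model/Sana_1600M_1024px"))
```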

src/diffusers/models/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@
     _import_structure["transformers.latte_transformer_3d"] = ["LatteTransformer3DModel"]
     _import_structure["transformers.lumina_nextdit2d"] = ["LuminaNextDiT2DModel"]
     _import_structure["transformers.pixart_transformer_2d"] = ["PixArtTransformer2DModel"]
-    _import_structure["transformers.sana_transformer_2d"] = ["SanaTransformer2DModel"]
+    _import_structure["transformers.sana_transformer"] = ["SanaTransformer2DModel"]
     _import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
     _import_structure["transformers.stable_audio_transformer"] = ["StableAudioDiTModel"]
     _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
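The key rename matters because diffusers' lazy loader treats each _import_structure key as a real submodule path, and the transformer now lives in transformers/sana_transformer.py (see the last file in this commit). A minimal sketch of that resolution with plain importlib rather than diffusers' internal loader; the lazy_get helper is illustrative only.

```python
# Sketch: a lazy-import key must name a module that importlib can actually
# resolve; the attribute list says what to re-export from it.
import importlib

_import_structure = {
    "transformers.sana_transformer": ["SanaTransformer2DModel"],
}


def lazy_get(package: str, dotted_module: str, attr: str):
    # Resolve e.g. diffusers.models.transformers.sana_transformer lazily,
    # then pull the requested class off the loaded module.
    module = importlib.import_module(f"{package}.{dotted_module}")
    return getattr(module, attr)


# With the old key "transformers.sana_transformer_2d" this call would raise
# ModuleNotFoundError, because the file on disk is sana_transformer.py.
cls = lazy_get("diffusers.models", "transformers.sana_transformer", "SanaTransformer2DModel")
print(cls.__name__)
```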

src/diffusers/models/autoencoders/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -8,5 +8,5 @@
 from .autoencoder_oobleck import AutoencoderOobleck
 from .autoencoder_tiny import AutoencoderTiny
 from .consistency_decoder_vae import ConsistencyDecoderVAE
-from .autoencoder_dc import DCAE
+from .autoencoder_dc import AutoencoderDC
 from .vq_model import VQModel

src/diffusers/models/normalization.py

Lines changed: 1 addition & 1 deletion
@@ -594,7 +594,7 @@ def get_normalization(

 class RMSNormScaled(nn.Module):
     def __init__(self, dim, eps: float, elementwise_affine: bool = True, scale_factor: float = 1.0, bias: bool = False):
-        super().__init__(dim, eps, elementwise_affine)
+        super().__init__()
         self.weight = nn.Parameter(torch.ones(dim) * scale_factor)

         self.eps = eps
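The one-line fix above is needed because RMSNormScaled subclasses nn.Module directly, and nn.Module.__init__ takes no arguments, so super().__init__(dim, eps, elementwise_affine) failed at construction time. A minimal sketch of the class as the diff leaves it; the forward pass is an assumption based on standard RMSNorm and is not part of this commit.

```python
import torch
from torch import nn


class RMSNormScaled(nn.Module):
    def __init__(self, dim, eps: float, elementwise_affine: bool = True,
                 scale_factor: float = 1.0, bias: bool = False):
        super().__init__()  # nn.Module's initializer is parameterless
        self.weight = nn.Parameter(torch.ones(dim) * scale_factor)
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Assumed standard RMS normalization followed by the learned, pre-scaled gain.
        variance = hidden_states.float().pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.eps).to(hidden_states.dtype)
        return hidden_states * self.weight


# Quick check: output keeps the input shape.
print(RMSNormScaled(64, eps=1e-6)(torch.randn(2, 8, 64)).shape)
```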

src/diffusers/models/transformers/sana_transformer.py

Lines changed: 3 additions & 1 deletion
@@ -248,7 +248,6 @@ def forward(


 class SanaTransformer2DModel(ModelMixin, ConfigMixin):
-    # TODO: Change pixart name below
     r"""
     A 2D Transformer model as introduced in Sana family of models (https://arxiv.org/abs/2310.00426,
     https://arxiv.org/abs/2403.04692).
@@ -272,6 +271,8 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin):
             The width of the latent images. This parameter is fixed during training.
         patch_size (int, defaults to 1):
             Size of the patches the model processes, relevant for architectures working on non-sequential data.
+        activation_fn (str, optional, defaults to "gelu-approximate"):
+            Activation function to use in feed-forward networks within Transformer blocks.
         num_embeds_ada_norm (int, optional, defaults to 1000):
             Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
             inference.
@@ -311,6 +312,7 @@ def __init__(
         attention_bias: bool = True,
         sample_size: int = 32,
         patch_size: int = 1,
+        activation_fn: tuple = None,
         num_embeds_ada_norm: Optional[int] = 1000,
         upcast_attention: bool = False,
         norm_type: str = "ada_norm_single",
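The new docstring entry describes activation_fn as a string defaulting to "gelu-approximate", while the added signature line annotates it as tuple = None; only the parameter is visible here, not how it is consumed. As a hedged illustration, this is how such a string is typically fed to diffusers' FeedForward block; the dim value is just an example, and the exact wiring inside SanaTransformer2DModel is an assumption.

```python
import torch
from diffusers.models.attention import FeedForward

# "gelu-approximate" is the default named in the new docstring entry.
ff = FeedForward(dim=1152, mult=4, activation_fn="gelu-approximate")

hidden_states = torch.randn(1, 16, 1152)
print(ff(hidden_states).shape)  # torch.Size([1, 16, 1152])
```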
