
Commit 7fa435f

1. remove unused parameters in init;
2. code update;
1 parent f0aa9b9 commit 7fa435f

3 files changed: +28 additions, -9 deletions


scripts/convert_sana_pag_to_diffusers.py

Lines changed: 27 additions & 4 deletions
@@ -7,6 +7,7 @@

 import torch
 from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download, snapshot_download
 from termcolor import colored
 from transformers import AutoModelForCausalLM, AutoTokenizer

@@ -23,12 +24,35 @@

 CTX = init_empty_weights if is_accelerate_available else nullcontext

-ckpt_id = "Sana"
+ckpt_ids = [
+    "Efficient-Large-Model/Sana_1600M_1024px_MultiLing",
+    "Efficient-Large-Model/Sana_1600M_512px_MultiLing",
+    "Efficient-Large-Model/Sana_1600M_1024px",
+    "Efficient-Large-Model/Sana_1600M_512px",
+    "Efficient-Large-Model/Sana_600M_1024px",
+    "Efficient-Large-Model/Sana_600M_512px",
+]
 # https://github.com/NVlabs/Sana/blob/main/scripts/inference.py


 def main(args):
-    all_state_dict = torch.load(args.orig_ckpt_path, map_location=torch.device("cpu"))
+    ckpt_id = ckpt_ids[0]
+    cache_dir_path = os.path.expanduser("~/.cache/huggingface/hub")
+    if args.orig_ckpt_path is None:
+        snapshot_download(
+            repo_id=ckpt_id,
+            cache_dir=cache_dir_path,
+            repo_type="model",
+        )
+        file_path = hf_hub_download(
+            repo_id=ckpt_id,
+            filename=f"checkpoints/{ckpt_id.split('/')[-1]}.pth",
+            cache_dir=cache_dir_path,
+            repo_type="model",
+        )
+    else:
+        file_path = args.orig_ckpt_path
+    all_state_dict = torch.load(file_path, weights_only=True)
     state_dict = all_state_dict.pop("state_dict")
     converted_state_dict = {}

@@ -143,7 +167,6 @@ def main(args):
         attention_bias=False,
         sample_size=32,
         patch_size=1,
-        activation_fn=("silu", "silu", None),
         upcast_attention=False,
         norm_type="ada_norm_single",
         norm_elementwise_affine=False,
@@ -175,7 +198,7 @@ def main(args):
     print(
         colored(
             f"Only saving transformer model of {args.model_type}. "
-            f"Set --save_full_pipeline to save the whole SanaPipeline",
+            f"Set --save_full_pipeline to save the whole SanaPAGPipeline",
             "green",
             attrs=["bold"],
         )
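For reference, the download-and-resolve logic added above can be exercised on its own. A minimal standalone sketch, assuming the hard-coded repo id mirrors ckpt_ids[0] from the diff; the cache path matches the script's default:

import os

from huggingface_hub import hf_hub_download, snapshot_download

ckpt_id = "Efficient-Large-Model/Sana_1600M_1024px_MultiLing"  # ckpt_ids[0] above
cache_dir_path = os.path.expanduser("~/.cache/huggingface/hub")

# Fetch the whole repo first (sibling files may be needed by the conversion
# script), then resolve the local cache path of the .pth checkpoint inside it.
snapshot_download(repo_id=ckpt_id, cache_dir=cache_dir_path, repo_type="model")
file_path = hf_hub_download(
    repo_id=ckpt_id,
    filename=f"checkpoints/{ckpt_id.split('/')[-1]}.pth",
    cache_dir=cache_dir_path,
    repo_type="model",
)
print(file_path)  # a path under ~/.cache/huggingface/hub

Note that hf_hub_download alone would suffice to fetch the single .pth file; the preceding snapshot_download pulls the rest of the repo as the script does.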

scripts/convert_sana_to_diffusers.py

Lines changed: 1 addition & 2 deletions
@@ -52,7 +52,7 @@ def main(args):
         )
     else:
         file_path = args.orig_ckpt_path
-    all_state_dict = torch.load(file_path, map_location=torch.device("cpu"))
+    all_state_dict = torch.load(file_path, weights_only=True)
     state_dict = all_state_dict.pop("state_dict")
     converted_state_dict = {}

@@ -167,7 +167,6 @@ def main(args):
         attention_bias=False,
         sample_size=32,
         patch_size=1,
-        activation_fn=("silu", "silu", None),
         upcast_attention=False,
         norm_type="ada_norm_single",
         norm_elementwise_affine=False,
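The same torch.load(..., weights_only=True) switch appears in both conversion scripts. A minimal sketch of what it changes, with a hypothetical checkpoint filename:

import torch

# weights_only=True restricts torch.load's unpickler to tensors and primitive
# containers, so loading an untrusted .pth file cannot execute arbitrary code
# during deserialization. Note the previous map_location="cpu" is dropped:
# tensors load onto the device they were saved on, so pass map_location
# explicitly if the checkpoint was saved on a GPU the target machine lacks.
all_state_dict = torch.load("Sana_1600M_1024px.pth", weights_only=True)
state_dict = all_state_dict.pop("state_dict")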

src/diffusers/models/transformers/sana_transformer.py

Lines changed: 0 additions & 3 deletions
@@ -267,8 +267,6 @@ class SanaTransformer2DModel(ModelMixin, ConfigMixin):
             The width of the latent images. This parameter is fixed during training.
         patch_size (int, defaults to 1):
             Size of the patches the model processes, relevant for architectures working on non-sequential data.
-        activation_fn (str, optional, defaults to "gelu-approximate"):
-            Activation function to use in feed-forward networks within Transformer blocks.
         num_embeds_ada_norm (int, optional, defaults to 1000):
             Number of embeddings for AdaLayerNorm, fixed during training and affects the maximum denoising steps during
             inference.
@@ -308,7 +306,6 @@ def __init__(
         attention_bias: bool = True,
         sample_size: int = 32,
         patch_size: int = 1,
-        activation_fn: tuple = None,
         num_embeds_ada_norm: Optional[int] = 1000,
         upcast_attention: bool = False,
         norm_type: str = "ada_norm_single",
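Since activation_fn is removed from __init__ (it was unused, per the commit message), callers that still pass it should fail at construction time. A quick sketch, assuming the module path shown in this diff and that the remaining constructor arguments keep their defaults:

from diffusers.models.transformers.sana_transformer import SanaTransformer2DModel

# Constructing with the remaining keyword arguments works as before:
model = SanaTransformer2DModel(sample_size=32, patch_size=1)

# Passing the removed parameter should now raise a TypeError, since the
# signature no longer accepts it:
#   SanaTransformer2DModel(activation_fn=("silu", "silu", None))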
