huggingface · DN6 · Dec 23, 2024 · Dec 22, 2024 · Dec 22, 2024 · Dec 22, 2024
diff --git a/docs/source/en/api/models/sana_transformer2d.md b/docs/source/en/api/models/sana_transformer2d.md
@@ -22,7 +22,8 @@ The model can be loaded with the following code snippet.
 ```python
+import torch
 from diffusers import SanaTransformer2DModel
 
-transformer = SanaTransformer2DModel.from_pretrained("Efficient-Large-Model/Sana_1600M_1024px_diffusers", subfolder="transformer", torch_dtype=torch.float16)
+transformer = SanaTransformer2DModel.from_pretrained("Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
 ```
 
 ## SanaTransformer2DModel

diff --git a/docs/source/en/api/pipelines/sana.md b/docs/source/en/api/pipelines/sana.md
@@ -32,9 +32,9 @@ Available models:
 
 | Model | Recommended dtype |
 |:-----:|:-----------------:|
+| [`Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers) | `torch.bfloat16` |
 | [`Efficient-Large-Model/Sana_1600M_1024px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_diffusers) | `torch.float16` |
 | [`Efficient-Large-Model/Sana_1600M_1024px_MultiLing_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing_diffusers) | `torch.float16` |
-| [`Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers) | `torch.bfloat16` |
 | [`Efficient-Large-Model/Sana_1600M_512px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_diffusers) | `torch.float16` |
 | [`Efficient-Large-Model/Sana_1600M_512px_MultiLing_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_MultiLing_diffusers) | `torch.float16` |
 | [`Efficient-Large-Model/Sana_600M_1024px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px_diffusers) | `torch.float16` |

diff --git a/scripts/convert_sana_to_diffusers.py b/scripts/convert_sana_to_diffusers.py
@@ -88,13 +88,18 @@ def main(args):
     # y norm
     converted_state_dict["caption_norm.weight"] = state_dict.pop("attention_y_norm.weight")
 
+    # scheduler
     flow_shift = 3.0
+
+    # model config
     if args.model_type == "SanaMS_1600M_P1_D20":
         layer_num = 20
     elif args.model_type == "SanaMS_600M_P1_D28":
         layer_num = 28
     else:
         raise ValueError(f"{args.model_type} is not supported.")
+    # Positional embedding interpolation scale.
+    interpolation_scale = {512: None, 1024: None, 2048: 1.0}
 
     for depth in range(layer_num):
         # Transformer blocks.
@@ -176,6 +181,7 @@ def main(args):
             patch_size=1,
             norm_elementwise_affine=False,
             norm_eps=1e-6,
+            interpolation_scale=interpolation_scale[args.image_size],
         )
 
     if is_accelerate_available():

diff --git a/src/diffusers/models/transformers/sana_transformer.py b/src/diffusers/models/transformers/sana_transformer.py
@@ -242,21 +242,24 @@ def __init__(
         patch_size: int = 1,
         norm_elementwise_affine: bool = False,
         norm_eps: float = 1e-6,
+        interpolation_scale: Optional[float] = None,
     ) -> None:
         super().__init__()
 
         out_channels = out_channels or in_channels
         inner_dim = num_attention_heads * attention_head_dim
 
         # 1. Patch Embedding
+        # `None` keeps the previous behaviour (`pos_embed_type=None`); positional embeddings are
+        # only requested when the config provides an interpolation scale (e.g. 2048px checkpoints).
         self.patch_embed = PatchEmbed(
             height=sample_size,
             width=sample_size,
             patch_size=patch_size,
             in_channels=in_channels,
             embed_dim=inner_dim,
-            interpolation_scale=None,
-            pos_embed_type=None,
+            interpolation_scale=interpolation_scale,
+            pos_embed_type="sincos" if interpolation_scale is not None else None,
         )
 
         # 2. Additional condition embeddings

diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sana.py b/src/diffusers/pipelines/pag/pipeline_pag_sana.py
@@ -59,13 +59,13 @@
         >>> from diffusers import SanaPAGPipeline
 
         >>> pipe = SanaPAGPipeline.from_pretrained(
-        ...     "Efficient-Large-Model/Sana_1600M_1024px_diffusers",
+        ...     "Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers",
         ...     pag_applied_layers=["transformer_blocks.8"],
         ...     torch_dtype=torch.float32,
         ... )
         >>> pipe.to("cuda")
         >>> pipe.text_encoder.to(torch.bfloat16)
-        >>> pipe.transformer = pipe.transformer.to(torch.float16)
+        >>> pipe.transformer = pipe.transformer.to(torch.bfloat16)
 
         >>> image = pipe(prompt='a cyberpunk cat with a neon sign that says "Sana"')[0]
         >>> image[0].save("output.png")

diff --git a/src/diffusers/pipelines/sana/pipeline_sana.py b/src/diffusers/pipelines/sana/pipeline_sana.py
@@ -62,11 +62,11 @@
         >>> from diffusers import SanaPipeline
 
         >>> pipe = SanaPipeline.from_pretrained(
-        ...     "Efficient-Large-Model/Sana_1600M_1024px_diffusers", torch_dtype=torch.float32
+        ...     "Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers", torch_dtype=torch.float32
         ... )
         >>> pipe.to("cuda")
         >>> pipe.text_encoder.to(torch.bfloat16)
-        >>> pipe.transformer = pipe.transformer.to(torch.float16)
+        >>> pipe.transformer = pipe.transformer.to(torch.bfloat16)
 
         >>> image = pipe(prompt='a cyberpunk cat with a neon sign that says "Sana"')[0]
         >>> image[0].save("output.png")