
Commit 47cc046

1. add BF16 .pth checkpoint file path;
2. add complex human instruction to the pipeline;
1 parent 34c5880

2 files changed (+34, -5 lines)

scripts/convert_sana_to_diffusers.py

Lines changed: 5 additions & 3 deletions
@@ -26,6 +26,7 @@
 
 ckpt_ids = [
     "Efficient-Large-Model/Sana_1600M_1024px_MultiLing",
+    "Efficient-Large-Model/Sana_1600M_1024px_BF16",
     "Efficient-Large-Model/Sana_1600M_512px_MultiLing",
     "Efficient-Large-Model/Sana_1600M_1024px",
     "Efficient-Large-Model/Sana_1600M_512px",
@@ -39,7 +40,7 @@ def main(args):
     ckpt_id = ckpt_ids[0]
     cache_dir_path = os.path.expanduser("~/.cache/huggingface/hub")
 
-    if args.orig_ckpt_path is None:
+    if args.orig_ckpt_path is None or args.orig_ckpt_path in ckpt_ids:
         snapshot_download(
             repo_id=ckpt_id,
             cache_dir=cache_dir_path,
@@ -169,7 +170,7 @@ def main(args):
         caption_channels=2304,
         mlp_ratio=2.5,
         attention_bias=False,
-        sample_size=32,
+        sample_size=args.image_size // 32,
         patch_size=1,
         norm_elementwise_affine=False,
         norm_eps=1e-6,
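
The old hardcoded `sample_size=32` only matched the 1024px checkpoints: Sana's DC-AE autoencoder compresses images by a factor of 32 per side, so the transformer's latent grid should be `image_size // 32`. A minimal sketch of the arithmetic (illustration only, not part of the commit):

# Latent grid side per resolution, assuming DC-AE's 32x spatial compression.
for image_size in (512, 1024):
    print(f"{image_size}px -> sample_size={image_size // 32}")
# 512px -> sample_size=16
# 1024px -> sample_size=32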
@@ -191,6 +192,8 @@ def main(args):
     num_model_params = sum(p.numel() for p in transformer.parameters())
     print(f"Total number of transformer parameters: {num_model_params}")
 
+    transformer = transformer.to(weight_dtype)
+
     if not args.save_full_pipeline:
         print(
             colored(
@@ -200,7 +203,6 @@ def main(args):
                 attrs=["bold"],
             )
         )
-        transformer = transformer.to(weight_dtype)
         transformer.save_pretrained(
             os.path.join(args.dump_path, "transformer"), safe_serialization=True, max_shard_size="5GB", variant=variant
         )
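
Two effects of these script hunks: a Hub repo id listed in `ckpt_ids` can now be passed as `--orig_ckpt_path` and still trigger `snapshot_download`, and the `weight_dtype` cast is hoisted out of the transformer-only branch, so the `--save_full_pipeline` path also saves weights in the requested dtype. A small sketch of what the hoisted cast does (illustration only; a stand-in `nn.Linear` plays the transformer's role):

import torch

weight_dtype = torch.bfloat16
transformer = torch.nn.Linear(4, 4)          # parameters start in fp32
transformer = transformer.to(weight_dtype)   # the cast this commit hoists
print(transformer.weight.dtype)              # torch.bfloat16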

src/diffusers/pipelines/sana/pipeline_sana.py

Lines changed: 29 additions & 2 deletions
@@ -166,6 +166,7 @@ def encode_prompt(
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         clean_caption: bool = False,
         max_sequence_length: int = 300,
+        complex_human_instruction: list[str] = [],
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -192,6 +193,8 @@ def encode_prompt(
             clean_caption (`bool`, defaults to `False`):
                 If `True`, the function will preprocess and clean the provided caption before encoding.
             max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
+            complex_human_instruction (`list[str]`, defaults to `[]`):
+                If not empty, the function will prepend this complex human instruction to the prompt before encoding.
         """
 
         if device is None:
@@ -206,13 +209,24 @@ def encode_prompt(
 
         # See Section 3.1. of the paper.
         max_length = max_sequence_length
+        select_index = [0] + list(range(-max_length + 1, 0))
 
         if prompt_embeds is None:
             prompt = self._text_preprocessing(prompt, clean_caption=clean_caption)
+
+            # prepare complex human instruction
+            if not complex_human_instruction:
+                max_length_all = max_length
+            else:
+                chi_prompt = "\n".join(complex_human_instruction)
+                prompt = [chi_prompt + p for p in prompt]
+                num_chi_prompt_tokens = len(self.tokenizer.encode(chi_prompt))
+                max_length_all = num_chi_prompt_tokens + max_length - 2
+
             text_inputs = self.tokenizer(
                 prompt,
                 padding="max_length",
-                max_length=max_length,
+                max_length=max_length_all,
                 truncation=True,
                 add_special_tokens=True,
                 return_tensors="pt",
@@ -223,7 +237,8 @@ def encode_prompt(
             prompt_attention_mask = prompt_attention_mask.to(device)
 
             prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
-            prompt_embeds = prompt_embeds[0]
+            prompt_embeds = prompt_embeds[0][:, select_index]
+            prompt_attention_mask = prompt_attention_mask[:, select_index]
 
         if self.transformer is not None:
             dtype = self.transformer.dtype
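
How the two `encode_prompt` hunks fit together: the tokenizer budget grows to `num_chi_prompt_tokens + max_length - 2` so both the instruction prefix and the user prompt fit, the text encoder attends over the full sequence, and `select_index` then keeps position 0 (the leading special token) plus the last `max_length - 1` positions, trimming the instruction out of the returned embeddings so their length stays fixed at `max_length`. A toy demonstration (not from the commit; `max_length` shrunk to 6 for readability):

import torch

max_length = 6                                     # stands in for 300
select_index = [0] + list(range(-max_length + 1, 0))
print(select_index)                                # [0, -5, -4, -3, -2, -1]

# Toy "embeddings" over 10 positions: token 0, then 4 instruction tokens,
# then 5 prompt tokens (values are just position ids).
seq = torch.arange(10).unsqueeze(0)                # shape (1, 10)
print(seq[:, select_index])                        # tensor([[0, 5, 6, 7, 8, 9]])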
@@ -566,6 +581,16 @@ def __call__(
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 300,
+        complex_human_instruction: list[str] = [
+            'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:',
+            '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.',
+            '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.',
+            'Here are examples of how to transform or refine prompts:',
+            '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.',
+            '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.',
+            'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:',
+            'User Prompt: ',
+        ],
     ) -> Union[SanaPipelineOutput, Tuple]:
         """
         Function invoked when calling the pipeline for generation.
@@ -644,6 +669,8 @@ def __call__(
                 `._callback_tensor_inputs` attribute of your pipeline class.
             max_sequence_length (`int` defaults to `300`):
                 Maximum sequence length to use with the `prompt`.
+            complex_human_instruction (`list[str]`, *optional*):
+                Instructions for the complex human instruction; see https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55.
 
         Examples:
 
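A hedged usage sketch for the new pipeline argument (the local checkpoint path is hypothetical; any converted Sana checkpoint directory or Hub id would do). By default the prompt-enhancement template above is prepended before text encoding; passing an empty list encodes the raw prompt only:

import torch
from diffusers import SanaPipeline

# Load a converted checkpoint (hypothetical local path from the script above).
pipe = SanaPipeline.from_pretrained("./sana-1600m-1024px-bf16", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Default: the complex human instruction is prepended to the prompt.
image = pipe(prompt="A cat sleeping").images[0]

# Opt out: an empty list disables the instruction prefix.
image_raw = pipe(prompt="A cat sleeping", complex_human_instruction=[]).images[0]
image.save("cat.png")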

0 commit comments
