fix the tokenizer.padding_side='right' bug;

lawrence-cj · lawrence-cj · commit b4af50d67f83 · 2024-12-13T02:10:32.000-08:00
diff --git a/src/diffusers/pipelines/sana/pipeline_sana.py b/src/diffusers/pipelines/sana/pipeline_sana.py
@@ -165,7 +165,7 @@ def encode_prompt(
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         clean_caption: bool = False,
         max_sequence_length: int = 300,
-        complex_human_instruction=None,
+        complex_human_instruction: Optional[List[str]] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -206,6 +206,8 @@ def encode_prompt(
         else:
             batch_size = prompt_embeds.shape[0]
 
+        self.tokenizer.padding_side = "right"
+
         # See Section 3.1. of the paper.
         max_length = max_sequence_length
         select_index = [0] + list(range(-max_length + 1, 0))
@@ -325,7 +327,7 @@ def check_inputs(
         negative_prompt_attention_mask=None,
     ):
         if height % 32 != 0 or width % 32 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+            raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")
 
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -581,7 +583,7 @@ def __call__(
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 300,
         complex_human_instruction: list[str] = [
-            'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:',
+            "Given a user prompt, generate an 'Enhanced prompt' that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:",
             "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.",
             "- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.",
             "Here are examples of how to transform or refine prompts:",