Merge branch 'main' into cogvideo_dataset_resize__crop

a-r-r-o-w · web-flow · commit ba7bb57ac5be · 2024-10-08T12:32:19.000+05:30
diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md
@@ -177,7 +177,7 @@ transformer = FluxTransformer2DModel.from_pretrained(
 ```
 
 > [!TIP]
-> At any point, you can try `print(pipeline.hf_device_map)` to see how the various models are distributed across devices. This is useful for tracking the device placement of the models.
+> At any point, you can try `print(pipeline.hf_device_map)` to see how the various models are distributed across devices. This is useful for tracking the device placement of the models. You can also try `print(transformer.hf_device_map)` to see how the transformer model is sharded across devices.
 
 Add the transformer model to the pipeline for denoising, but set the other model-level components like the text encoders and VAE to `None` because you don't need them yet.
 
diff --git a/examples/controlnet/train_controlnet_sd3.py b/examples/controlnet/train_controlnet_sd3.py
@@ -357,6 +357,11 @@ def parse_args(input_args=None):
         action="store_true",
         help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
     )
+    parser.add_argument(
+        "--upcast_vae",
+        action="store_true",
+        help="Whether or not to upcast vae to fp32",
+    )
     parser.add_argument(
         "--learning_rate",
         type=float,
@@ -1094,7 +1099,10 @@ def load_model_hook(models, input_dir):
         weight_dtype = torch.bfloat16
 
     # Move vae, transformer and text_encoder to device and cast to weight_dtype
-    vae.to(accelerator.device, dtype=torch.float32)
+    if args.upcast_vae:
+        vae.to(accelerator.device, dtype=torch.float32)
+    else:
+        vae.to(accelerator.device, dtype=weight_dtype)
     transformer.to(accelerator.device, dtype=weight_dtype)
     text_encoder_one.to(accelerator.device, dtype=weight_dtype)
     text_encoder_two.to(accelerator.device, dtype=weight_dtype)
diff --git a/src/diffusers/loaders/lora_conversion_utils.py b/src/diffusers/loaders/lora_conversion_utils.py
@@ -632,7 +632,7 @@ def handle_qkv(sds_sd, ait_sd, sds_key, ait_keys, dims=None):
                 new_key += ".lora_B.weight"
 
         # Handle single_blocks
-        elif old_key.startswith("diffusion_model.single_blocks", "single_blocks"):
+        elif old_key.startswith(("diffusion_model.single_blocks", "single_blocks")):
             block_num = re.search(r"single_blocks\.(\d+)", old_key).group(1)
             new_key = f"transformer.single_transformer_blocks.{block_num}"
 
diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py
@@ -251,6 +251,9 @@ def __init__(
             if hasattr(self, "transformer") and self.transformer is not None
             else 128
         )
+        self.patch_size = (
+            self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
+        )
 
     # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds
     def _get_t5_prompt_embeds(
@@ -577,8 +580,14 @@ def check_inputs(
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if (
+            height % (self.vae_scale_factor * self.patch_size) != 0
+            or width % (self.vae_scale_factor * self.patch_size) != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
+                f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
+            )
 
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py
@@ -212,6 +212,9 @@ def __init__(
             if hasattr(self, "transformer") and self.transformer is not None
             else 128
         )
+        self.patch_size = (
+            self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
+        )
 
         self.set_pag_applied_layers(
             pag_applied_layers, pag_attn_processors=(PAGCFGJointAttnProcessor2_0(), PAGJointAttnProcessor2_0())
@@ -542,8 +545,14 @@ def check_inputs(
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if (
+            height % (self.vae_scale_factor * self.patch_size) != 0
+            or width % (self.vae_scale_factor * self.patch_size) != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
+                f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
+            )
 
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py
@@ -601,6 +601,7 @@ def load_sub_model(
     variant: str,
     low_cpu_mem_usage: bool,
     cached_folder: Union[str, os.PathLike],
+    use_safetensors: bool,
 ):
     """Helper method to load the module `name` from `library_name` and `class_name`"""
 
@@ -670,6 +671,7 @@ def load_sub_model(
         loading_kwargs["offload_folder"] = offload_folder
         loading_kwargs["offload_state_dict"] = offload_state_dict
         loading_kwargs["variant"] = model_variants.pop(name, None)
+        loading_kwargs["use_safetensors"] = use_safetensors
 
         if from_flax:
             loading_kwargs["from_flax"] = True
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
@@ -905,6 +905,7 @@ def load_module(name, value):
                     variant=variant,
                     low_cpu_mem_usage=low_cpu_mem_usage,
                     cached_folder=cached_folder,
+                    use_safetensors=use_safetensors,
                 )
                 logger.info(
                     f"Loaded {name} as {class_name} from `{name}` subfolder of {pretrained_model_name_or_path}."
diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
@@ -203,6 +203,9 @@ def __init__(
             if hasattr(self, "transformer") and self.transformer is not None
             else 128
         )
+        self.patch_size = (
+            self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
+        )
 
     def _get_t5_prompt_embeds(
         self,
@@ -525,8 +528,14 @@ def check_inputs(
         callback_on_step_end_tensor_inputs=None,
         max_sequence_length=None,
     ):
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if (
+            height % (self.vae_scale_factor * self.patch_size) != 0
+            or width % (self.vae_scale_factor * self.patch_size) != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
+                f"You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
+            )
 
         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
@@ -668,8 +668,9 @@ def __getattr__(cls, key):
 # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
 def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
     """
-    Args:
     Compares a library version to some requirement using a given operation.
+
+    Args:
         library_or_version (`str` or `packaging.version.Version`):
             A library name or a version to check.
         operation (`str`):
@@ -688,8 +689,9 @@ def compare_versions(library_or_version: Union[str, Version], operation: str, re
 # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338
 def is_torch_version(operation: str, version: str):
     """
-    Args:
     Compares the current PyTorch version to a given reference with an operation.
+
+    Args:
         operation (`str`):
             A string representation of an operator, such as `">"` or `"<="`
         version (`str`):
@@ -700,8 +702,9 @@ def is_torch_version(operation: str, version: str):
 
 def is_transformers_version(operation: str, version: str):
     """
-    Args:
     Compares the current Transformers version to a given reference with an operation.
+
+    Args:
         operation (`str`):
             A string representation of an operator, such as `">"` or `"<="`
         version (`str`):
@@ -714,8 +717,9 @@ def is_transformers_version(operation: str, version: str):
 
 def is_accelerate_version(operation: str, version: str):
     """
-    Args:
     Compares the current Accelerate version to a given reference with an operation.
+
+    Args:
         operation (`str`):
             A string representation of an operator, such as `">"` or `"<="`
         version (`str`):
@@ -728,8 +732,9 @@ def is_accelerate_version(operation: str, version: str):
 
 def is_peft_version(operation: str, version: str):
     """
-    Args:
     Compares the current PEFT version to a given reference with an operation.
+
+    Args:
         operation (`str`):
             A string representation of an operator, such as `">"` or `"<="`
         version (`str`):
@@ -742,8 +747,9 @@ def is_peft_version(operation: str, version: str):
 
 def is_k_diffusion_version(operation: str, version: str):
     """
-    Args:
     Compares the current k-diffusion version to a given reference with an operation.
+
+    Args:
         operation (`str`):
             A string representation of an operator, such as `">"` or `"<="`
         version (`str`):
@@ -756,8 +762,9 @@ def is_k_diffusion_version(operation: str, version: str):
 
 def get_objects_from_module(module):
     """
-    Args:
     Returns a dict of object names and values in a module, while skipping private/internal objects
+
+    Args:
         module (ModuleType):
             Module to extract the objects from.
 
@@ -775,7 +782,9 @@ def get_objects_from_module(module):
 
 
 class OptionalDependencyNotAvailable(BaseException):
-    """An error indicating that an optional dependency of Diffusers was not found in the environment."""
+    """
+    An error indicating that an optional dependency of Diffusers was not found in the environment.
+    """
 
 
 class _LazyModule(ModuleType):
diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py
@@ -169,7 +169,11 @@ def test_modify_padding_mode(self):
 @unittest.skip("We cannot run inference on this model with the current CI hardware")
 # TODO (DN6, sayakpaul): move these tests to a beefier GPU
 class FluxLoRAIntegrationTests(unittest.TestCase):
-    """internal note: The integration slices were obtained on audace."""
+    """internal note: The integration slices were obtained on audace.
+
+    torch: 2.6.0.dev20241006+cu124 with CUDA 12.5. Need the same setup for the
+    assertions to pass.
+    """
 
     num_inference_steps = 10
     seed = 0

Original file line number	Diff line number	Diff line change
`@@ -905,6 +905,7 @@ def load_module(name, value):`
`905`	`905`	`variant=variant,`
`906`	`906`	`low_cpu_mem_usage=low_cpu_mem_usage,`
`907`	`907`	`cached_folder=cached_folder,`
	`908`	`+ use_safetensors=use_safetensors,`
`908`	`909`	`)`
`909`	`910`	`logger.info(`
`910`	`911`	f"Loaded {name} as {class_name} from `{name}` subfolder of {pretrained_model_name_or_path}."