
Commit 9017a2c
Merge branch 'main' into benchmarking-overhaul
2 parents: f9285fd + 9154566
39 files changed, +433 −113 lines

src/diffusers/loaders/lora_pipeline.py

Lines changed: 7 additions & 2 deletions
@@ -81,12 +81,17 @@ def _maybe_dequantize_weight_for_expanded_lora(model, module):
         from ..quantizers.gguf.utils import dequantize_gguf_tensor
 
     is_bnb_4bit_quantized = module.weight.__class__.__name__ == "Params4bit"
+    is_bnb_8bit_quantized = module.weight.__class__.__name__ == "Int8Params"
     is_gguf_quantized = module.weight.__class__.__name__ == "GGUFParameter"
 
     if is_bnb_4bit_quantized and not is_bitsandbytes_available():
         raise ValueError(
             "The checkpoint seems to have been quantized with `bitsandbytes` (4bits). Install `bitsandbytes` to load quantized checkpoints."
         )
+    if is_bnb_8bit_quantized and not is_bitsandbytes_available():
+        raise ValueError(
+            "The checkpoint seems to have been quantized with `bitsandbytes` (8bits). Install `bitsandbytes` to load quantized checkpoints."
+        )
     if is_gguf_quantized and not is_gguf_available():
         raise ValueError(
             "The checkpoint seems to have been quantized with `gguf`. Install `gguf` to load quantized checkpoints."
@@ -97,10 +102,10 @@ def _maybe_dequantize_weight_for_expanded_lora(model, module):
         weight_on_cpu = True
 
     device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
-    if is_bnb_4bit_quantized:
+    if is_bnb_4bit_quantized or is_bnb_8bit_quantized:
         module_weight = dequantize_bnb_weight(
             module.weight.to(device) if weight_on_cpu else module.weight,
-            state=module.weight.quant_state,
+            state=module.weight.quant_state if is_bnb_4bit_quantized else module.state,
             dtype=model.dtype,
         ).data
     elif is_gguf_quantized:
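
Context for the change above: bitsandbytes keeps its dequantization metadata in different places for its two schemes. `Params4bit` (4-bit) attaches a `quant_state` to the weight parameter, while `Int8Params` (8-bit) layers keep their state object on the module itself, which is why the updated code reads `module.state` in the 8-bit case. A minimal sketch of that dispatch (the helper name and import path below are assumptions, not part of this commit):

```py
def dequantize_base_weight(model, module):
    """Sketch: pick the right bitsandbytes state object for 4-bit vs. 8-bit layers."""
    # Assumed import path; in diffusers the helper lives under the bitsandbytes quantizer utilities.
    from diffusers.quantizers.bitsandbytes import dequantize_bnb_weight

    weight_cls = module.weight.__class__.__name__
    if weight_cls == "Params4bit":  # 4-bit: quant state hangs off the parameter
        state = module.weight.quant_state
    elif weight_cls == "Int8Params":  # 8-bit: the state object hangs off the module
        state = module.state
    else:
        return module.weight.data  # not a bitsandbytes-quantized layer
    return dequantize_bnb_weight(module.weight, state=state, dtype=model.dtype).data
```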

src/diffusers/models/modeling_utils.py

Lines changed: 32 additions & 3 deletions
@@ -814,14 +814,43 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            device_map (`Union[int, str, torch.device]` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                 A map that specifies where each submodule should go. It doesn't need to be defined for each
                 parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                 same device. Defaults to `None`, meaning that the model will be loaded on CPU.
 
+                Examples:
+
+                ```py
+                >>> from diffusers import AutoModel
+                >>> import torch
+
+                >>> # This works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
+                ... )
+                >>> # This also works (integer accelerator device ID).
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map=0
+                ... )
+                >>> # Specifying a supported offloading strategy like "auto" also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="auto"
+                ... )
+                >>> # Specifying a dictionary as `device_map` also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0",
+                ...     subfolder="unet",
+                ...     device_map={"": torch.device("cuda")},
+                ... )
+                ```
+
                 Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
                 more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+                map](https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap). You
+                can also refer to the [Diffusers-specific
+                documentation](https://huggingface.co/docs/diffusers/main/en/training/distributed_inference#model-sharding)
+                for more concrete examples.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
                 each GPU and the available CPU RAM if unset.
@@ -1387,7 +1416,7 @@ def _load_pretrained_model(
         low_cpu_mem_usage: bool = True,
         dtype: Optional[Union[str, torch.dtype]] = None,
         keep_in_fp32_modules: Optional[List[str]] = None,
-        device_map: Dict[str, Union[int, str, torch.device]] = None,
+        device_map: Union[str, int, torch.device, Dict[str, Union[int, str, torch.device]]] = None,
         offload_state_dict: Optional[bool] = None,
         offload_folder: Optional[Union[str, os.PathLike]] = None,
         dduf_entries: Optional[Dict[str, DDUFEntry]] = None,

src/diffusers/pipelines/flux/pipeline_flux_inpaint.py

Lines changed: 5 additions & 0 deletions
@@ -1193,6 +1193,11 @@ def __call__(
             image = self.vae.decode(latents, return_dict=False)[0]
             image = self.image_processor.postprocess(image, output_type=output_type)
 
+            if padding_mask_crop is not None:
+                image = [
+                    self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image
+                ]
+
         # Offload all models
         self.maybe_free_model_hooks()
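
This mirrors how other inpaint pipelines paste the processed crop back onto the original image when `padding_mask_crop` is set. A rough usage sketch (the repo id, image URLs, and prompt are placeholders, not taken from this commit):

```py
import torch
from diffusers import FluxInpaintPipeline
from diffusers.utils import load_image

# Placeholder checkpoint and inputs for illustration only.
pipe = FluxInpaintPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
image = load_image("https://example.com/photo.png")
mask = load_image("https://example.com/mask.png")

# With `padding_mask_crop`, only a crop around the masked region is denoised and the
# result is pasted back onto the original image via `apply_overlay` (the fix above).
result = pipe(
    prompt="a red sofa",
    image=image,
    mask_image=mask,
    padding_mask_crop=32,
).images[0]
```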

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 5 additions & 8 deletions
@@ -669,14 +669,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
-                A map that specifies where each submodule should go. It doesn’t need to be defined for each
-                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
-                same device.
-
-                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
-                more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            device_map (`str`, *optional*):
+                Strategy that dictates how the different components of a pipeline should be placed on available
+                devices. Currently, only "balanced" `device_map` is supported. Check out
+                [this](https://huggingface.co/docs/diffusers/main/en/tutorials/inference_with_big_models#device-placement)
+                to know more.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
                 each GPU and the available CPU RAM if unset.
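
The corrected docstring reflects that pipeline-level `device_map` accepts only the "balanced" placement strategy; per-module device maps apply to individual models, not whole pipelines. A short sketch (checkpoint id is illustrative):

```py
import torch
from diffusers import DiffusionPipeline

# "balanced" spreads the pipeline's components across the available accelerators.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    device_map="balanced",
)
```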

src/diffusers/pipelines/wan/pipeline_wan_video2video.py

Lines changed: 2 additions & 7 deletions
@@ -419,12 +419,7 @@ def prepare_latents(
         )
 
         if latents is None:
-            if isinstance(generator, list):
-                init_latents = [
-                    retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
-                ]
-            else:
-                init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
+            init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), sample_mode="argmax") for vid in video]
 
             init_latents = torch.cat(init_latents, dim=0).to(dtype)
 
@@ -441,7 +436,7 @@ def prepare_latents(
             if hasattr(self.scheduler, "add_noise"):
                 latents = self.scheduler.add_noise(init_latents, noise, timestep)
             else:
-                latents = self.scheduelr.scale_noise(init_latents, timestep, noise)
+                latents = self.scheduler.scale_noise(init_latents, timestep, noise)
         else:
             latents = latents.to(device)
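
For context, switching to `sample_mode="argmax"` makes the video-to-video latent initialization use the mode of the VAE's latent distribution instead of drawing a generator-dependent sample, so the per-generator branching is no longer needed. A simplified sketch of the `retrieve_latents` helper this relies on (paraphrased, not copied from this commit):

```py
def retrieve_latents(encoder_output, generator=None, sample_mode="sample"):
    # Simplified: return a sample, the distribution mode, or precomputed latents.
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")
```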

src/diffusers/quantizers/quantization_config.py

Lines changed: 12 additions & 10 deletions
@@ -493,7 +493,7 @@ def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]]
         TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method()
         if self.quant_type not in TORCHAO_QUANT_TYPE_METHODS.keys():
             is_floating_quant_type = self.quant_type.startswith("float") or self.quant_type.startswith("fp")
-            if is_floating_quant_type and not self._is_cuda_capability_atleast_8_9():
+            if is_floating_quant_type and not self._is_xpu_or_cuda_capability_atleast_8_9():
                 raise ValueError(
                     f"Requested quantization type: {self.quant_type} is not supported on GPUs with CUDA capability <= 8.9. You "
                     f"can check the CUDA capability of your GPU using `torch.cuda.get_device_capability()`."
@@ -645,7 +645,7 @@ def generate_fpx_quantization_types(bits: int):
             QUANTIZATION_TYPES.update(INT8_QUANTIZATION_TYPES)
             QUANTIZATION_TYPES.update(UINTX_QUANTIZATION_DTYPES)
 
-            if cls._is_cuda_capability_atleast_8_9():
+            if cls._is_xpu_or_cuda_capability_atleast_8_9():
                 QUANTIZATION_TYPES.update(FLOATX_QUANTIZATION_TYPES)
 
             return QUANTIZATION_TYPES
@@ -655,14 +655,16 @@ def generate_fpx_quantization_types(bits: int):
         )
 
     @staticmethod
-    def _is_cuda_capability_atleast_8_9() -> bool:
-        if not torch.cuda.is_available():
-            raise RuntimeError("TorchAO requires a CUDA compatible GPU and installation of PyTorch.")
-
-        major, minor = torch.cuda.get_device_capability()
-        if major == 8:
-            return minor >= 9
-        return major >= 9
+    def _is_xpu_or_cuda_capability_atleast_8_9() -> bool:
+        if torch.cuda.is_available():
+            major, minor = torch.cuda.get_device_capability()
+            if major == 8:
+                return minor >= 9
+            return major >= 9
+        elif torch.xpu.is_available():
+            return True
+        else:
+            raise RuntimeError("TorchAO requires a CUDA compatible GPU or Intel XPU and installation of PyTorch.")
 
     def get_apply_tensor_subclass(self):
         TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method()
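
With this change, float8-style torchao quant types pass the capability check on Intel XPU devices as well as on CUDA GPUs with compute capability of at least 8.9. A rough usage sketch (the checkpoint id and quant type string are illustrative; check `TorchAoConfig` for the supported type names):

```py
import torch
from diffusers import AutoModel, TorchAoConfig

# Float8 weight-only quantization: requires CUDA compute capability >= 8.9,
# or (after this change) an available Intel XPU.
quant_config = TorchAoConfig("float8wo_e4m3")
model = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```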

src/diffusers/utils/dynamic_modules_utils.py

Lines changed: 21 additions & 3 deletions
@@ -154,12 +154,30 @@ def check_imports(filename):
     return get_relative_imports(filename)
 
 
-def get_class_in_module(class_name, module_path):
+def get_class_in_module(class_name, module_path, pretrained_model_name_or_path=None):
     """
     Import a module on the cache directory for modules and extract a class from it.
     """
     module_path = module_path.replace(os.path.sep, ".")
-    module = importlib.import_module(module_path)
+    try:
+        module = importlib.import_module(module_path)
+    except ModuleNotFoundError as e:
+        # This can happen when the repo id contains ".", which Python's import machinery interprets as a directory
+        # separator. We do a bit of monkey patching to detect and fix this case.
+        if not (
+            pretrained_model_name_or_path is not None
+            and "." in pretrained_model_name_or_path
+            and module_path.startswith("diffusers_modules")
+            and pretrained_model_name_or_path.replace("/", "--") in module_path
+        ):
+            raise e  # We can't figure this one out, just reraise the original error
+
+        corrected_path = os.path.join(HF_MODULES_CACHE, module_path.replace(".", "/")) + ".py"
+        corrected_path = corrected_path.replace(
+            pretrained_model_name_or_path.replace("/", "--").replace(".", "/"),
+            pretrained_model_name_or_path.replace("/", "--"),
+        )
+        module = importlib.machinery.SourceFileLoader(module_path, corrected_path).load_module()
 
     if class_name is None:
         return find_pipeline_class(module)
@@ -454,4 +472,4 @@ def get_class_from_dynamic_module(
         revision=revision,
         local_files_only=local_files_only,
     )
-    return get_class_in_module(class_name, final_module.replace(".py", ""))
+    return get_class_in_module(class_name, final_module.replace(".py", ""), pretrained_model_name_or_path)
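
The fallback above targets custom-code repos whose id contains a dot, which `importlib` would otherwise treat as a package separator when importing the cached module. A rough sketch of the call path that now works (the repo id is a placeholder):

```py
from diffusers import DiffusionPipeline

# Placeholder repo id containing a "."; loading its remote pipeline code goes through
# get_class_from_dynamic_module -> get_class_in_module, which now falls back to a
# SourceFileLoader when the dotted module path cannot be imported directly.
pipe = DiffusionPipeline.from_pretrained(
    "some-org/my-pipeline-v1.5",
    custom_pipeline="some-org/my-pipeline-v1.5",
    trust_remote_code=True,
)
```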

src/diffusers/utils/testing_utils.py

Lines changed: 13 additions & 3 deletions
@@ -291,6 +291,18 @@ def decorator(test_case):
     return decorator
 
 
+def require_torch_version_greater(torch_version):
+    """Decorator marking a test that requires torch with a specific version greater."""
+
+    def decorator(test_case):
+        correct_torch_version = is_torch_available() and is_torch_version(">", torch_version)
+        return unittest.skipUnless(
+            correct_torch_version, f"test requires torch with the version greater than {torch_version}"
+        )(test_case)
+
+    return decorator
+
+
 def require_torch_gpu(test_case):
     """Decorator marking a test that requires CUDA and PyTorch."""
     return unittest.skipUnless(is_torch_available() and torch_device == "cuda", "test requires PyTorch+CUDA")(
@@ -300,9 +312,7 @@ def require_torch_gpu(test_case):
 
 def require_torch_cuda_compatibility(expected_compute_capability):
     def decorator(test_case):
-        if not torch.cuda.is_available():
-            return unittest.skip(test_case)
-        else:
+        if torch.cuda.is_available():
             current_compute_capability = get_torch_cuda_device_capability()
             return unittest.skipUnless(
                 float(current_compute_capability) == float(expected_compute_capability),
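
The new decorator is used like the existing `require_torch_*` helpers; a minimal sketch (the version string is illustrative):

```py
from diffusers.utils.testing_utils import require_torch_version_greater


@require_torch_version_greater("2.4")
def test_feature_needing_recent_torch():
    # Skipped automatically unless the installed torch is strictly newer than 2.4.
    ...
```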

tests/models/autoencoders/test_models_consistency_decoder_vae.py

Lines changed: 3 additions & 2 deletions
@@ -21,6 +21,7 @@
 
 from diffusers import ConsistencyDecoderVAE, StableDiffusionPipeline
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_image,
     slow,
@@ -162,13 +163,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @torch.no_grad()
     def test_encode_decode(self):

tests/models/unets/test_models_unet_2d.py

Lines changed: 2 additions & 1 deletion
@@ -22,6 +22,7 @@
 from diffusers import UNet2DModel
 from diffusers.utils import logging
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     require_torch_accelerator,
@@ -229,7 +230,7 @@ def test_from_pretrained_accelerate_wont_change_results(self):
 
         # two models don't need to stay in the device at the same time
         del model_accelerate
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
         gc.collect()
 
         model_normal_load, _ = UNet2DModel.from_pretrained(
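
Both test files swap the CUDA-only `torch.cuda.empty_cache()` for the device-agnostic `backend_empty_cache(torch_device)` helper. A simplified sketch of the dispatch it performs (not the exact implementation in `testing_utils`):

```py
import torch


def backend_empty_cache(device: str) -> None:
    # Simplified: route the cache-clearing call to whichever backend is active.
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        torch.xpu.empty_cache()
    elif device == "mps":
        torch.mps.empty_cache()
    # CPU has no allocator cache to clear.
```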
