
Commit 9017a2c
Merge branch 'main' into benchmarking-overhaul
2 parents: f9285fd + 9154566
39 files changed, +433 −113 lines

src/diffusers/loaders/lora_pipeline.py

Lines changed: 7 additions & 2 deletions
@@ -81,12 +81,17 @@ def _maybe_dequantize_weight_for_expanded_lora(model, module):
         from ..quantizers.gguf.utils import dequantize_gguf_tensor
 
     is_bnb_4bit_quantized = module.weight.__class__.__name__ == "Params4bit"
+    is_bnb_8bit_quantized = module.weight.__class__.__name__ == "Int8Params"
     is_gguf_quantized = module.weight.__class__.__name__ == "GGUFParameter"
 
     if is_bnb_4bit_quantized and not is_bitsandbytes_available():
         raise ValueError(
             "The checkpoint seems to have been quantized with `bitsandbytes` (4bits). Install `bitsandbytes` to load quantized checkpoints."
         )
+    if is_bnb_8bit_quantized and not is_bitsandbytes_available():
+        raise ValueError(
+            "The checkpoint seems to have been quantized with `bitsandbytes` (8bits). Install `bitsandbytes` to load quantized checkpoints."
+        )
     if is_gguf_quantized and not is_gguf_available():
         raise ValueError(
             "The checkpoint seems to have been quantized with `gguf`. Install `gguf` to load quantized checkpoints."
@@ -97,10 +102,10 @@ def _maybe_dequantize_weight_for_expanded_lora(model, module):
         weight_on_cpu = True
 
     device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
-    if is_bnb_4bit_quantized:
+    if is_bnb_4bit_quantized or is_bnb_8bit_quantized:
         module_weight = dequantize_bnb_weight(
             module.weight.to(device) if weight_on_cpu else module.weight,
-            state=module.weight.quant_state,
+            state=module.weight.quant_state if is_bnb_4bit_quantized else module.state,
             dtype=model.dtype,
         ).data
     elif is_gguf_quantized:
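
Context for the change above: bitsandbytes keeps its dequantization metadata in different places for its two schemes. `Params4bit` (4-bit) attaches a `quant_state` to the weight parameter, while `Int8Params` (8-bit) layers keep their state object on the module itself, which is why the updated code reads `module.state` in the 8-bit case. A minimal sketch of that dispatch (the helper name and import path below are assumptions, not part of this commit):

```py
def dequantize_base_weight(model, module):
    """Sketch: pick the right bitsandbytes state object for 4-bit vs. 8-bit layers."""
    # Assumed import path; in diffusers the helper lives under the bitsandbytes quantizer utilities.
    from diffusers.quantizers.bitsandbytes import dequantize_bnb_weight

    weight_cls = module.weight.__class__.__name__
    if weight_cls == "Params4bit":  # 4-bit: quant state hangs off the parameter
        state = module.weight.quant_state
    elif weight_cls == "Int8Params":  # 8-bit: the state object hangs off the module
        state = module.state
    else:
        return module.weight.data  # not a bitsandbytes-quantized layer
    return dequantize_bnb_weight(module.weight, state=state, dtype=model.dtype).data
```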

src/diffusers/models/modeling_utils.py

Lines changed: 32 additions & 3 deletions
@@ -814,14 +814,43 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            device_map (`Union[int, str, torch.device]` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                 A map that specifies where each submodule should go. It doesn't need to be defined for each
                 parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                 same device. Defaults to `None`, meaning that the model will be loaded on CPU.
 
+                Examples:
+
+                ```py
+                >>> from diffusers import AutoModel
+                >>> import torch
+
+                >>> # This works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
+                ... )
+                >>> # This also works (integer accelerator device ID).
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map=0
+                ... )
+                >>> # Specifying a supported offloading strategy like "auto" also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="auto"
+                ... )
+                >>> # Specifying a dictionary as `device_map` also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0",
+                ...     subfolder="unet",
+                ...     device_map={"": torch.device("cuda")},
+                ... )
+                ```
+
                 Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
                 more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+                map](https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap). You
+                can also refer to the [Diffusers-specific
+                documentation](https://huggingface.co/docs/diffusers/main/en/training/distributed_inference#model-sharding)
+                for more concrete examples.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
                 each GPU and the available CPU RAM if unset.
@@ -1387,7 +1416,7 @@ def _load_pretrained_model(
         low_cpu_mem_usage: bool = True,
         dtype: Optional[Union[str, torch.dtype]] = None,
         keep_in_fp32_modules: Optional[List[str]] = None,
-        device_map: Dict[str, Union[int, str, torch.device]] = None,
+        device_map: Union[str, int, torch.device, Dict[str, Union[int, str, torch.device]]] = None,
         offload_state_dict: Optional[bool] = None,
         offload_folder: Optional[Union[str, os.PathLike]] = None,
         dduf_entries: Optional[Dict[str, DDUFEntry]] = None,

src/diffusers/pipelines/flux/pipeline_flux_inpaint.py

Lines changed: 5 additions & 0 deletions
@@ -1193,6 +1193,11 @@ def __call__(
             image = self.vae.decode(latents, return_dict=False)[0]
             image = self.image_processor.postprocess(image, output_type=output_type)
 
+            if padding_mask_crop is not None:
+                image = [
+                    self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image
+                ]
+
         # Offload all models
         self.maybe_free_model_hooks()
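
This mirrors how other inpaint pipelines paste the processed crop back onto the original image when `padding_mask_crop` is set. A rough usage sketch (the repo id, image URLs, and prompt are placeholders, not taken from this commit):

```py
import torch
from diffusers import FluxInpaintPipeline
from diffusers.utils import load_image

# Placeholder checkpoint and inputs for illustration only.
pipe = FluxInpaintPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
image = load_image("https://example.com/photo.png")
mask = load_image("https://example.com/mask.png")

# With `padding_mask_crop`, only a crop around the masked region is denoised and the
# result is pasted back onto the original image via `apply_overlay` (the fix above).
result = pipe(
    prompt="a red sofa",
    image=image,
    mask_image=mask,
    padding_mask_crop=32,
).images[0]
```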

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 5 additions & 8 deletions
@@ -669,14 +669,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
-                A map that specifies where each submodule should go. It doesn’t need to be defined for each
-                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
-                same device.
-
-                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
-                more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            device_map (`str`, *optional*):
+                Strategy that dictates how the different components of a pipeline should be placed on available
+                devices. Currently, only "balanced" `device_map` is supported. Check out
+                [this](https://huggingface.co/docs/diffusers/main/en/tutorials/inference_with_big_models#device-placement)
+                to know more.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
                 each GPU and the available CPU RAM if unset.
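
The corrected docstring reflects that pipeline-level `device_map` accepts only the "balanced" placement strategy; per-module device maps apply to individual models, not whole pipelines. A short sketch (checkpoint id is illustrative):

```py
import torch
from diffusers import DiffusionPipeline

# "balanced" spreads the pipeline's components across the available accelerators.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    device_map="balanced",
)
```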

src/diffusers/pipelines/wan/pipeline_wan_video2video.py

Lines changed: 2 additions & 7 deletions
@@ -419,12 +419,7 @@ def prepare_latents(
         )
 
         if latents is None:
-            if isinstance(generator, list):
-                init_latents = [
-                    retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
-                ]
-            else:
-                init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
+            init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), sample_mode="argmax") for vid in video]
 
             init_latents = torch.cat(init_latents, dim=0).to(dtype)
 
@@ -441,7 +436,7 @@ def prepare_latents(
             if hasattr(self.scheduler, "add_noise"):
                 latents = self.scheduler.add_noise(init_latents, noise, timestep)
             else:
-                latents = self.scheduelr.scale_noise(init_latents, timestep, noise)
+                latents = self.scheduler.scale_noise(init_latents, timestep, noise)
         else:
             latents = latents.to(device)
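
For context, switching to `sample_mode="argmax"` makes the video-to-video latent initialization use the mode of the VAE's latent distribution instead of drawing a generator-dependent sample, so the per-generator branching is no longer needed. A simplified sketch of the `retrieve_latents` helper this relies on (paraphrased, not copied from this commit):

```py
def retrieve_latents(encoder_output, generator=None, sample_mode="sample"):
    # Simplified: return a sample, the distribution mode, or precomputed latents.
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")
```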

src/diffusers/quantizers/quantization_config.py

Lines changed: 12 additions & 10 deletions
@@ -493,7 +493,7 @@ def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]]
         TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method()
         if self.quant_type not in TORCHAO_QUANT_TYPE_METHODS.keys():
             is_floating_quant_type = self.quant_type.startswith("float") or self.quant_type.startswith("fp")
-            if is_floating_quant_type and not self._is_cuda_capability_atleast_8_9():
+            if is_floating_quant_type and not self._is_xpu_or_cuda_capability_atleast_8_9():
                 raise ValueError(
                     f"Requested quantization type: {self.quant_type} is not supported on GPUs with CUDA capability <= 8.9. You "
                     f"can check the CUDA capability of your GPU using `torch.cuda.get_device_capability()`."
@@ -645,7 +645,7 @@ def generate_fpx_quantization_types(bits: int):
             QUANTIZATION_TYPES.update(INT8_QUANTIZATION_TYPES)
             QUANTIZATION_TYPES.update(UINTX_QUANTIZATION_DTYPES)
 
-            if cls._is_cuda_capability_atleast_8_9():
+            if cls._is_xpu_or_cuda_capability_atleast_8_9():
                 QUANTIZATION_TYPES.update(FLOATX_QUANTIZATION_TYPES)
 
             return QUANTIZATION_TYPES
@@ -655,14 +655,16 @@ def generate_fpx_quantization_types(bits: int):
         )
 
     @staticmethod
-    def _is_cuda_capability_atleast_8_9() -> bool:
-        if not torch.cuda.is_available():
-            raise RuntimeError("TorchAO requires a CUDA compatible GPU and installation of PyTorch.")
-
-        major, minor = torch.cuda.get_device_capability()
-        if major == 8:
-            return minor >= 9
-        return major >= 9
+    def _is_xpu_or_cuda_capability_atleast_8_9() -> bool:
+        if torch.cuda.is_available():
+            major, minor = torch.cuda.get_device_capability()
+            if major == 8:
+                return minor >= 9
+            return major >= 9
+        elif torch.xpu.is_available():
+            return True
+        else:
+            raise RuntimeError("TorchAO requires a CUDA compatible GPU or Intel XPU and installation of PyTorch.")
 
     def get_apply_tensor_subclass(self):
         TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method()
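
With this change, float8-style torchao quant types pass the capability check on Intel XPU devices as well as on CUDA GPUs with compute capability of at least 8.9. A rough usage sketch (the checkpoint id and quant type string are illustrative; check `TorchAoConfig` for the supported type names):

```py
import torch
from diffusers import AutoModel, TorchAoConfig

# Float8 weight-only quantization: requires CUDA compute capability >= 8.9,
# or (after this change) an available Intel XPU.
quant_config = TorchAoConfig("float8wo_e4m3")
model = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
```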

src/diffusers/utils/dynamic_modules_utils.py

Lines changed: 21 additions & 3 deletions
@@ -154,12 +154,30 @@ def check_imports(filename):
     return get_relative_imports(filename)
 
 
-def get_class_in_module(class_name, module_path):
+def get_class_in_module(class_name, module_path, pretrained_model_name_or_path=None):
     """
     Import a module on the cache directory for modules and extract a class from it.
     """
     module_path = module_path.replace(os.path.sep, ".")
-    module = importlib.import_module(module_path)
+    try:
+        module = importlib.import_module(module_path)
+    except ModuleNotFoundError as e:
+        # This can happen when the repo id contains ".", which Python's import machinery interprets as a directory
+        # separator. We do a bit of monkey patching to detect and fix this case.
+        if not (
+            pretrained_model_name_or_path is not None
+            and "." in pretrained_model_name_or_path
+            and module_path.startswith("diffusers_modules")
+            and pretrained_model_name_or_path.replace("/", "--") in module_path
+        ):
+            raise e  # We can't figure this one out, just reraise the original error
+
+        corrected_path = os.path.join(HF_MODULES_CACHE, module_path.replace(".", "/")) + ".py"
+        corrected_path = corrected_path.replace(
+            pretrained_model_name_or_path.replace("/", "--").replace(".", "/"),
+            pretrained_model_name_or_path.replace("/", "--"),
+        )
+        module = importlib.machinery.SourceFileLoader(module_path, corrected_path).load_module()
 
     if class_name is None:
         return find_pipeline_class(module)
@@ -454,4 +472,4 @@ def get_class_from_dynamic_module(
         revision=revision,
         local_files_only=local_files_only,
     )
-    return get_class_in_module(class_name, final_module.replace(".py", ""))
+    return get_class_in_module(class_name, final_module.replace(".py", ""), pretrained_model_name_or_path)
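
The fallback above targets custom-code repos whose id contains a dot, which `importlib` would otherwise treat as a package separator when importing the cached module. A rough sketch of the call path that now works (the repo id is a placeholder):

```py
from diffusers import DiffusionPipeline

# Placeholder repo id containing a "."; loading its remote pipeline code goes through
# get_class_from_dynamic_module -> get_class_in_module, which now falls back to a
# SourceFileLoader when the dotted module path cannot be imported directly.
pipe = DiffusionPipeline.from_pretrained(
    "some-org/my-pipeline-v1.5",
    custom_pipeline="some-org/my-pipeline-v1.5",
    trust_remote_code=True,
)
```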

src/diffusers/utils/testing_utils.py

Lines changed: 13 additions & 3 deletions
@@ -291,6 +291,18 @@ def decorator(test_case):
     return decorator
 
 
+def require_torch_version_greater(torch_version):
+    """Decorator marking a test that requires torch with a specific version greater."""
+
+    def decorator(test_case):
+        correct_torch_version = is_torch_available() and is_torch_version(">", torch_version)
+        return unittest.skipUnless(
+            correct_torch_version, f"test requires torch with the version greater than {torch_version}"
+        )(test_case)
+
+    return decorator
+
+
 def require_torch_gpu(test_case):
     """Decorator marking a test that requires CUDA and PyTorch."""
     return unittest.skipUnless(is_torch_available() and torch_device == "cuda", "test requires PyTorch+CUDA")(
@@ -300,9 +312,7 @@ def require_torch_gpu(test_case):
 
 def require_torch_cuda_compatibility(expected_compute_capability):
     def decorator(test_case):
-        if not torch.cuda.is_available():
-            return unittest.skip(test_case)
-        else:
+        if torch.cuda.is_available():
             current_compute_capability = get_torch_cuda_device_capability()
             return unittest.skipUnless(
                 float(current_compute_capability) == float(expected_compute_capability),
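
The new decorator is used like the existing `require_torch_*` helpers; a minimal sketch (the version string is illustrative):

```py
from diffusers.utils.testing_utils import require_torch_version_greater


@require_torch_version_greater("2.4")
def test_feature_needing_recent_torch():
    # Skipped automatically unless the installed torch is strictly newer than 2.4.
    ...
```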

tests/models/autoencoders/test_models_consistency_decoder_vae.py

Lines changed: 3 additions & 2 deletions
@@ -21,6 +21,7 @@
 
 from diffusers import ConsistencyDecoderVAE, StableDiffusionPipeline
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_image,
     slow,
@@ -162,13 +163,13 @@ def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @torch.no_grad()
     def test_encode_decode(self):

tests/models/unets/test_models_unet_2d.py

Lines changed: 2 additions & 1 deletion
@@ -22,6 +22,7 @@
 from diffusers import UNet2DModel
 from diffusers.utils import logging
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     require_torch_accelerator,
@@ -229,7 +230,7 @@ def test_from_pretrained_accelerate_wont_change_results(self):
 
         # two models don't need to stay in the device at the same time
         del model_accelerate
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
         gc.collect()
 
         model_normal_load, _ = UNet2DModel.from_pretrained(
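
Both test files swap the CUDA-only `torch.cuda.empty_cache()` for the device-agnostic `backend_empty_cache(torch_device)` helper. A simplified sketch of the dispatch it performs (not the exact implementation in `testing_utils`):

```py
import torch


def backend_empty_cache(device: str) -> None:
    # Simplified: route the cache-clearing call to whichever backend is active.
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        torch.xpu.empty_cache()
    elif device == "mps":
        torch.mps.empty_cache()
    # CPU has no allocator cache to clear.
```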
