
Commit 7c3df5d

Merge branch 'main' into deprecate-slicing-tiling-pipe
2 parents 5d84141 + 55f0b3d commit 7c3df5d

16 files changed: +371 additions, -30 deletions


docs/source/en/api/pipelines/cogvideox.md

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ from diffusers.utils import export_to_video
 pipeline_quant_config = PipelineQuantizationConfig(
     quant_backend="torchao",
     quant_kwargs={"quant_type": "int8wo"},
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 # fp8 layerwise weight-casting
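For context, the surrounding CogVideoX docs snippet now reads roughly as follows end to end. This is a sketch assembled from the lines visible in the hunk; the model ID, `.to("cuda")` placement, prompt, and export settings outside the shown context are assumptions, and the `torchao` backend must be installed.

```py
import torch
from diffusers import CogVideoXPipeline
from diffusers.quantizers import PipelineQuantizationConfig
from diffusers.utils import export_to_video

# Quantize only the transformer; a plain string is now accepted for a single component.
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="torchao",
    quant_kwargs={"quant_type": "int8wo"},
    components_to_quantize="transformer",
)

pipeline = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
).to("cuda")

video = pipeline(prompt="A panda strumming a tiny guitar in a bamboo forest").frames[0]
export_to_video(video, "output.mp4", fps=8)
```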

docs/source/en/api/pipelines/hunyuan_video.md

Lines changed: 3 additions & 3 deletions

@@ -54,7 +54,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
         "bnb_4bit_quant_type": "nf4",
         "bnb_4bit_compute_dtype": torch.bfloat16
     },
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -91,7 +91,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
         "bnb_4bit_quant_type": "nf4",
         "bnb_4bit_compute_dtype": torch.bfloat16
     },
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -139,7 +139,7 @@ export_to_video(video, "output.mp4", fps=15)
         "bnb_4bit_quant_type": "nf4",
         "bnb_4bit_compute_dtype": torch.bfloat16
     },
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(

docs/source/en/optimization/memory.md

Lines changed: 46 additions & 3 deletions

@@ -291,13 +291,53 @@ Group offloading moves groups of internal layers ([torch.nn.ModuleList](https://
 > [!WARNING]
 > Group offloading may not work with all models if the forward implementation contains weight-dependent device casting of inputs because it may clash with group offloading's device casting mechanism.

-Call [`~ModelMixin.enable_group_offload`] to enable it for standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
-
-The `offload_type` parameter can be set to `block_level` or `leaf_level`.
+Enable group offloading by configuring the `offload_type` parameter to `block_level` or `leaf_level`.

 - `block_level` offloads groups of layers based on the `num_blocks_per_group` parameter. For example, if `num_blocks_per_group=2` on a model with 40 layers, 2 layers are onloaded and offloaded at a time (20 total onloads/offloads). This drastically reduces memory requirements.
 - `leaf_level` offloads individual layers at the lowest level and is equivalent to [CPU offloading](#cpu-offloading). But it can be made faster if you use streams without giving up inference speed.

+Group offloading is supported for entire pipelines or individual models. Applying group offloading to the entire pipeline is the easiest option while selectively applying it to individual models gives users more flexibility to use different offloading techniques for different models.
+
+<hfoptions id="group-offloading">
+<hfoption id="pipeline">
+
+Call [`~DiffusionPipeline.enable_group_offload`] on a pipeline.
+
+```py
+import torch
+from diffusers import CogVideoXPipeline
+from diffusers.hooks import apply_group_offloading
+from diffusers.utils import export_to_video
+
+onload_device = torch.device("cuda")
+offload_device = torch.device("cpu")
+
+pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
+pipeline.enable_group_offload(
+    onload_device=onload_device,
+    offload_device=offload_device,
+    offload_type="leaf_level",
+    use_stream=True
+)
+
+prompt = (
+    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
+    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
+    "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
+    "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
+    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
+    "atmosphere of this unique musical performance."
+)
+video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+export_to_video(video, "output.mp4", fps=8)
+```
+
+</hfoption>
+<hfoption id="model">
+
+Call [`~ModelMixin.enable_group_offload`] on standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
+
 ```py
 import torch
 from diffusers import CogVideoXPipeline
@@ -328,6 +368,9 @@ print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} G
 export_to_video(video, "output.mp4", fps=8)
 ```

+</hfoption>
+</hfoptions>
+
 #### CUDA stream

 The `use_stream` parameter can be activated for CUDA devices that support asynchronous data transfer streams to reduce overall execution time compared to [CPU offloading](#cpu-offloading). It overlaps data transfer and computation by using layer prefetching. The next layer to be executed is loaded onto the GPU while the current layer is still being executed. It can increase CPU memory significantly so ensure you have 2x the amount of memory as the model size.
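As the added paragraph in this hunk notes, group offloading can also be mixed per component. The following is a hedged sketch (not part of the commit) that applies [`~ModelMixin.enable_group_offload`] to the transformer and [`~hooks.apply_group_offloading`] to the T5 text encoder of the same CogVideoX pipeline; the particular `offload_type` and `num_blocks_per_group` choices are illustrative.

```py
import torch
from diffusers import CogVideoXPipeline
from diffusers.hooks import apply_group_offloading

onload_device = torch.device("cuda")
offload_device = torch.device("cpu")

pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

# The transformer inherits from ModelMixin, so it exposes enable_group_offload directly.
pipeline.transformer.enable_group_offload(
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="block_level",
    num_blocks_per_group=2,
)

# The T5 text encoder is a plain torch.nn.Module (no ModelMixin), so it goes through
# apply_group_offloading instead.
apply_group_offloading(
    module=pipeline.text_encoder,
    onload_device=onload_device,
    offload_device=offload_device,
    offload_type="leaf_level",
    use_stream=True,
)
```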

docs/source/en/quantization/overview.md

Lines changed: 4 additions & 1 deletion

@@ -34,7 +34,9 @@ Initialize [`~quantizers.PipelineQuantizationConfig`] with the following paramet
 > [!TIP]
 > These `quant_kwargs` arguments are different for each backend. Refer to the [Quantization API](../api/quantization) docs to view the arguments for each backend.

-- `components_to_quantize` specifies which components of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
+- `components_to_quantize` specifies which component(s) of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
+
+  `components_to_quantize` accepts either a list for multiple models or a string for a single model.

 The example below loads the bitsandbytes backend with the following arguments from [`~quantizers.quantization_config.BitsAndBytesConfig`], `load_in_4bit`, `bnb_4bit_quant_type`, and `bnb_4bit_compute_dtype`.

@@ -62,6 +64,7 @@ pipe = DiffusionPipeline.from_pretrained(
 image = pipe("photo of a cute dog").images[0]
 ```

+
 ### Advanced quantization

 The `quant_mapping` argument provides more options for how to quantize each individual component in a pipeline, like combining different quantization backends.
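To illustrate the added sentence about accepted types, a minimal sketch of both forms, assuming the bitsandbytes backend from the surrounding example is installed; the component names (`transformer`, `text_encoder_2` for Flux's T5 encoder) are assumptions here.

```py
import torch
from diffusers.quantizers import PipelineQuantizationConfig

# Single component: a plain string is enough.
single = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
    components_to_quantize="transformer",
)

# Multiple components: pass a list of component names.
multi = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
    components_to_quantize=["transformer", "text_encoder_2"],
)
```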

docs/source/en/using-diffusers/text-img2vid.md

Lines changed: 1 addition & 1 deletion

@@ -98,7 +98,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
         "bnb_4bit_quant_type": "nf4",
         "bnb_4bit_compute_dtype": torch.bfloat16
     },
-    components_to_quantize=["transformer"]
+    components_to_quantize="transformer"
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(

src/diffusers/loaders/single_file_model.py

Lines changed: 23 additions & 12 deletions

@@ -22,6 +22,7 @@
 from typing_extensions import Self

 from .. import __version__
+from ..models.model_loading_utils import _caching_allocator_warmup, _determine_device_map, _expand_device_map
 from ..quantizers import DiffusersAutoQuantizer
 from ..utils import deprecate, is_accelerate_available, is_torch_version, logging
 from ..utils.torch_utils import empty_device_cache
@@ -297,6 +298,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
         low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
         device = kwargs.pop("device", None)
         disable_mmap = kwargs.pop("disable_mmap", False)
+        device_map = kwargs.pop("device_map", None)

         user_agent = {"diffusers": __version__, "file_type": "single_file", "framework": "pytorch"}
         # In order to ensure popular quantization methods are supported. Can be disable with `disable_telemetry`
@@ -403,19 +405,8 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
         with ctx():
             model = cls.from_config(diffusers_model_config)

-        checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs)
-
-        if _should_convert_state_dict_to_diffusers(model.state_dict(), checkpoint):
-            diffusers_format_checkpoint = checkpoint_mapping_fn(
-                config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
-            )
-        else:
-            diffusers_format_checkpoint = checkpoint
+        model_state_dict = model.state_dict()

-        if not diffusers_format_checkpoint:
-            raise SingleFileComponentError(
-                f"Failed to load {mapping_class_name}. Weights for this component appear to be missing in the checkpoint."
-            )
         # Check if `_keep_in_fp32_modules` is not None
         use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and (
             (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules")
@@ -428,6 +419,26 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
         else:
             keep_in_fp32_modules = []

+        # Now that the model is loaded, we can determine the `device_map`
+        device_map = _determine_device_map(model, device_map, None, torch_dtype, keep_in_fp32_modules, hf_quantizer)
+        if device_map is not None:
+            expanded_device_map = _expand_device_map(device_map, model_state_dict.keys())
+            _caching_allocator_warmup(model, expanded_device_map, torch_dtype, hf_quantizer)
+
+        checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs)
+
+        if _should_convert_state_dict_to_diffusers(model_state_dict, checkpoint):
+            diffusers_format_checkpoint = checkpoint_mapping_fn(
+                config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
+            )
+        else:
+            diffusers_format_checkpoint = checkpoint
+
+        if not diffusers_format_checkpoint:
+            raise SingleFileComponentError(
+                f"Failed to load {mapping_class_name}. Weights for this component appear to be missing in the checkpoint."
+            )
+
         if hf_quantizer is not None:
             hf_quantizer.preprocess_model(
                 model=model,
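A hypothetical usage sketch of the newly threaded-through `device_map` kwarg, under the assumption that single-device maps such as `"cuda"` are resolved by `_determine_device_map` the same way as in `from_pretrained`; the checkpoint path below is a placeholder, not a real file.

```py
import torch
from diffusers import FluxTransformer2DModel

# Placeholder path -- substitute a real single-file checkpoint.
transformer = FluxTransformer2DModel.from_single_file(
    "./flux1-dev.safetensors",
    torch_dtype=torch.bfloat16,
    device_map="cuda",  # assumption: handled like the device_map argument of from_pretrained
)
```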

src/diffusers/modular_pipelines/flux/before_denoise.py

Lines changed: 3 additions & 2 deletions

@@ -454,6 +454,9 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
         block_state = self.get_block_state(state)
         block_state.device = components._execution_device

+        block_state.height = block_state.height or components.default_height
+        block_state.width = block_state.width or components.default_width
+
         scheduler = components.scheduler
         transformer = components.transformer
         batch_size = block_state.batch_size * block_state.num_images_per_prompt
@@ -659,8 +662,6 @@ def intermediate_outputs(self) -> List[OutputParam]:
     def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)

-        block_state.height = block_state.height or components.default_height
-        block_state.width = block_state.width or components.default_width
         block_state.device = components._execution_device
         block_state.dtype = torch.bfloat16  # TODO: okay to hardcode this?
         block_state.num_channels_latents = components.num_channels_latents

src/diffusers/modular_pipelines/flux/modular_blocks.py

Lines changed: 1 addition & 1 deletion

@@ -148,8 +148,8 @@ def description(self):
     [
         ("text_encoder", FluxTextEncoderStep),
         ("input", FluxInputStep),
-        ("set_timesteps", FluxSetTimestepsStep),
         ("prepare_latents", FluxPrepareLatentsStep),
+        ("set_timesteps", FluxSetTimestepsStep),
         ("denoise", FluxDenoiseStep),
         ("decode", FluxDecodeStep),
     ]

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 127 additions & 0 deletions

@@ -1335,6 +1335,133 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un
             offload_buffers = len(model._parameters) > 0
             cpu_offload(model, device, offload_buffers=offload_buffers)

+    def enable_group_offload(
+        self,
+        onload_device: torch.device,
+        offload_device: torch.device = torch.device("cpu"),
+        offload_type: str = "block_level",
+        num_blocks_per_group: Optional[int] = None,
+        non_blocking: bool = False,
+        use_stream: bool = False,
+        record_stream: bool = False,
+        low_cpu_mem_usage=False,
+        offload_to_disk_path: Optional[str] = None,
+        exclude_modules: Optional[Union[str, List[str]]] = None,
+    ) -> None:
+        r"""
+        Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is,
+        and where it is beneficial, we need to first provide some context on how other supported offloading methods
+        work.
+
+        Typically, offloading is done at two levels:
+        - Module-level: In Diffusers, this can be enabled using the `ModelMixin::enable_model_cpu_offload()` method. It
+          works by offloading each component of a pipeline to the CPU for storage, and onloading to the accelerator
+          device when needed for computation. This method is more memory-efficient than keeping all components on the
+          accelerator, but the memory requirements are still quite high. For this method to work, one needs memory
+          equivalent to size of the model in runtime dtype + size of largest intermediate activation tensors to be able
+          to complete the forward pass.
+        - Leaf-level: In Diffusers, this can be enabled using the `ModelMixin::enable_sequential_cpu_offload()` method.
+          It works by offloading the lowest leaf-level parameters of the computation graph to the CPU for storage, and
+          onloading only the leafs to the accelerator device for computation. This uses the lowest amount of
+          accelerator memory, but can be slower due to the excessive number of device synchronizations.
+
+        Group offloading is a middle ground between the two methods. It works by offloading groups of internal layers,
+        (either `torch.nn.ModuleList` or `torch.nn.Sequential`). This method uses lower memory than module-level
+        offloading. It is also faster than leaf-level/sequential offloading, as the number of device synchronizations
+        is reduced.
+
+        Another supported feature (for CUDA devices with support for asynchronous data transfer streams) is the ability
+        to overlap data transfer and computation to reduce the overall execution time compared to sequential
+        offloading. This is enabled using layer prefetching with streams, i.e., the layer that is to be executed next
+        starts onloading to the accelerator device while the current layer is being executed - this increases the
+        memory requirements slightly. Note that this implementation also supports leaf-level offloading but can be made
+        much faster when using streams.
+
+        Args:
+            onload_device (`torch.device`):
+                The device to which the group of modules are onloaded.
+            offload_device (`torch.device`, defaults to `torch.device("cpu")`):
+                The device to which the group of modules are offloaded. This should typically be the CPU. Default is
+                CPU.
+            offload_type (`str` or `GroupOffloadingType`, defaults to "block_level"):
+                The type of offloading to be applied. Can be one of "block_level" or "leaf_level". Default is
+                "block_level".
+            offload_to_disk_path (`str`, *optional*, defaults to `None`):
+                The path to the directory where parameters will be offloaded. Setting this option can be useful in
+                limited RAM environment settings where a reasonable speed-memory trade-off is desired.
+            num_blocks_per_group (`int`, *optional*):
+                The number of blocks per group when using offload_type="block_level". This is required when using
+                offload_type="block_level".
+            non_blocking (`bool`, defaults to `False`):
+                If True, offloading and onloading is done with non-blocking data transfer.
+            use_stream (`bool`, defaults to `False`):
+                If True, offloading and onloading is done asynchronously using a CUDA stream. This can be useful for
+                overlapping computation and data transfer.
+            record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
+                as having been used by this stream. It is faster at the expense of slightly more memory usage. Refer to
+                the [PyTorch official docs](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html)
+                for more details.
+            low_cpu_mem_usage (`bool`, defaults to `False`):
+                If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them.
+                This option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be
+                useful when the CPU memory is a bottleneck but may counteract the benefits of using streams.
+            exclude_modules (`Union[str, List[str]]`, defaults to `None`): List of modules to exclude from offloading.
+
+        Example:
+        ```python
+        >>> from diffusers import DiffusionPipeline
+        >>> import torch
+
+        >>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
+
+        >>> pipe.enable_group_offload(
+        ...     onload_device=torch.device("cuda"),
+        ...     offload_device=torch.device("cpu"),
+        ...     offload_type="leaf_level",
+        ...     use_stream=True,
+        ... )
+        >>> image = pipe("a beautiful sunset").images[0]
+        ```
+        """
+        from ..hooks import apply_group_offloading
+
+        if isinstance(exclude_modules, str):
+            exclude_modules = [exclude_modules]
+        elif exclude_modules is None:
+            exclude_modules = []
+
+        unknown = set(exclude_modules) - self.components.keys()
+        if unknown:
+            logger.info(
+                f"The following modules are not present in pipeline: {', '.join(unknown)}. Ignore if this is expected."
+            )
+
+        group_offload_kwargs = {
+            "onload_device": onload_device,
+            "offload_device": offload_device,
+            "offload_type": offload_type,
+            "num_blocks_per_group": num_blocks_per_group,
+            "non_blocking": non_blocking,
+            "use_stream": use_stream,
+            "record_stream": record_stream,
+            "low_cpu_mem_usage": low_cpu_mem_usage,
+            "offload_to_disk_path": offload_to_disk_path,
+        }
+        for name, component in self.components.items():
+            if name not in exclude_modules and isinstance(component, torch.nn.Module):
+                if hasattr(component, "enable_group_offload"):
+                    component.enable_group_offload(**group_offload_kwargs)
+                else:
+                    apply_group_offloading(module=component, **group_offload_kwargs)
+
+        if exclude_modules:
+            for module_name in exclude_modules:
+                module = getattr(self, module_name, None)
+                if module is not None and isinstance(module, torch.nn.Module):
+                    module.to(onload_device)
+                    logger.debug(f"Placed `{module_name}` on {onload_device} device as it was in `exclude_modules`.")
+
     def reset_device_map(self):
         r"""
         Resets the device maps (if any) to None.
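A short sketch of the new pipeline-level API with `exclude_modules`, based on the behavior in this hunk: a single string is normalized to a list, unknown names are only logged, and excluded components are placed on `onload_device` instead of being group-offloaded. The model ID follows the docstring example; the choice of `"vae"` as the excluded component is an assumption.

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
pipe.enable_group_offload(
    onload_device=torch.device("cuda"),
    offload_device=torch.device("cpu"),
    offload_type="block_level",
    num_blocks_per_group=2,
    exclude_modules="vae",  # normalized to ["vae"]; the VAE is kept on the onload device
)
image = pipe("a beautiful sunset").images[0]
```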
