Commit b1e752a

Merge branch 'main' into flux-quantized-w-lora
2 parents ffbc7c0 + 56f7400

File tree: 12 files changed, +155 -66 lines

.github/workflows/pr_tests_gpu.yml

Lines changed: 3 additions & 1 deletion

@@ -177,6 +177,7 @@ jobs:
 
   torch_cuda_tests:
     name: Torch CUDA Tests
+    needs: [check_code_quality, check_repository_consistency]
     runs-on:
       group: aws-g4dn-2xlarge
     container:
@@ -245,7 +246,7 @@ jobs:
 
   run_examples_tests:
     name: Examples PyTorch CUDA tests on Ubuntu
-    pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
+    needs: [check_code_quality, check_repository_consistency]
     runs-on:
       group: aws-g4dn-2xlarge
 
@@ -264,6 +265,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
           python -m uv pip install -e [quality,test,training]
 
       - name: Environment

examples/research_projects/autoencoderkl/train_autoencoderkl.py

Lines changed: 10 additions & 2 deletions

@@ -627,6 +627,7 @@ def main(args):
     ema_vae = EMAModel(vae.parameters(), model_cls=AutoencoderKL, model_config=vae.config)
     perceptual_loss = lpips.LPIPS(net="vgg").eval()
     discriminator = NLayerDiscriminator(input_nc=3, n_layers=3, use_actnorm=False).apply(weights_init)
+    discriminator = torch.nn.SyncBatchNorm.convert_sync_batchnorm(discriminator)
 
     # Taken from [Sayak Paul's Diffusers PR #6511](https://github.com/huggingface/diffusers/pull/6511/files)
     def unwrap_model(model):
@@ -951,13 +952,20 @@ def load_model_hook(models, input_dir):
                 logits_fake = discriminator(reconstructions)
                 disc_loss = hinge_d_loss if args.disc_loss == "hinge" else vanilla_d_loss
                 disc_factor = args.disc_factor if global_step >= args.disc_start else 0.0
-                disc_loss = disc_factor * disc_loss(logits_real, logits_fake)
+                d_loss = disc_factor * disc_loss(logits_real, logits_fake)
                 logs = {
-                    "disc_loss": disc_loss.detach().mean().item(),
+                    "disc_loss": d_loss.detach().mean().item(),
                     "logits_real": logits_real.detach().mean().item(),
                     "logits_fake": logits_fake.detach().mean().item(),
                     "disc_lr": disc_lr_scheduler.get_last_lr()[0],
                 }
+                accelerator.backward(d_loss)
+                if accelerator.sync_gradients:
+                    params_to_clip = discriminator.parameters()
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                disc_optimizer.step()
+                disc_lr_scheduler.step()
+                disc_optimizer.zero_grad(set_to_none=args.set_grads_to_none)
             # Checks if the accelerator has performed an optimization step behind the scenes
             if accelerator.sync_gradients:
                 progress_bar.update(1)
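The new discriminator update follows the usual Accelerate pattern: backward on the weighted hinge/vanilla loss, clip, step, and zero the dedicated optimizer. The SyncBatchNorm conversion is worth a standalone look; a minimal sketch in plain PyTorch (not the training script itself, names are illustrative):

import torch

# Toy discriminator with BatchNorm layers.
disc = torch.nn.Sequential(
    torch.nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1),
    torch.nn.BatchNorm2d(64),
    torch.nn.LeakyReLU(0.2),
)
# Recursively swaps every BatchNorm*d for SyncBatchNorm so batch statistics are
# shared across processes under distributed training; a no-op for norm-free models.
disc = torch.nn.SyncBatchNorm.convert_sync_batchnorm(disc)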

src/diffusers/hooks/group_offloading.py

Lines changed: 17 additions & 10 deletions

@@ -181,6 +181,13 @@ def __init__(self):
         self._layer_execution_tracker_module_names = set()
 
     def initialize_hook(self, module):
+        def make_execution_order_update_callback(current_name, current_submodule):
+            def callback():
+                logger.debug(f"Adding {current_name} to the execution order")
+                self.execution_order.append((current_name, current_submodule))
+
+            return callback
+
         # To every submodule that contains a group offloading hook (at this point, no prefetching is enabled for any
         # of the groups), we add a layer execution tracker hook that will be used to determine the order in which the
         # layers are executed during the forward pass.
@@ -192,14 +199,8 @@ def initialize_hook(self, module):
             group_offloading_hook = registry.get_hook(_GROUP_OFFLOADING)
 
             if group_offloading_hook is not None:
-
-                def make_execution_order_update_callback(current_name, current_submodule):
-                    def callback():
-                        logger.debug(f"Adding {current_name} to the execution order")
-                        self.execution_order.append((current_name, current_submodule))
-
-                    return callback
-
+                # For the first forward pass, we have to load in a blocking manner
+                group_offloading_hook.group.non_blocking = False
                 layer_tracker_hook = LayerExecutionTrackerHook(make_execution_order_update_callback(name, submodule))
                 registry.register_hook(layer_tracker_hook, _LAYER_EXECUTION_TRACKER)
                 self._layer_execution_tracker_module_names.add(name)
@@ -229,15 +230,21 @@ def post_forward(self, module, output):
         # Remove the layer execution tracker hooks from the submodules
         base_module_registry = module._diffusers_hook
         registries = [submodule._diffusers_hook for _, submodule in self.execution_order]
+        group_offloading_hooks = [registry.get_hook(_GROUP_OFFLOADING) for registry in registries]
 
         for i in range(num_executed):
             registries[i].remove_hook(_LAYER_EXECUTION_TRACKER, recurse=False)
 
         # Remove the current lazy prefetch group offloading hook so that it doesn't interfere with the next forward pass
         base_module_registry.remove_hook(_LAZY_PREFETCH_GROUP_OFFLOADING, recurse=False)
 
-        # Apply lazy prefetching by setting required attributes
-        group_offloading_hooks = [registry.get_hook(_GROUP_OFFLOADING) for registry in registries]
+        # LazyPrefetchGroupOffloadingHook is only used with streams, so we know that non_blocking should be True.
+        # We disable non_blocking for the first forward pass, but need to enable it for the subsequent passes to
+        # see the benefits of prefetching.
+        for hook in group_offloading_hooks:
+            hook.group.non_blocking = True
+
+        # Set required attributes for prefetching
         if num_executed > 0:
             base_module_group_offloading_hook = base_module_registry.get_hook(_GROUP_OFFLOADING)
             base_module_group_offloading_hook.next_group = group_offloading_hooks[0].group
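The change hoists make_execution_order_update_callback out of the per-submodule loop and toggles non_blocking so only the first forward pass loads groups synchronously. A toy sketch of the execution-order-tracking idea, written against plain PyTorch forward pre-hooks rather than the diffusers hook registry (all names here are illustrative):

import torch

execution_order = []

def make_execution_order_update_callback(name, module):
    def callback(*_):
        # Record the (name, module) pair the moment the module is about to run.
        execution_order.append((name, module))
    return callback

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
handles = [
    m.register_forward_pre_hook(make_execution_order_update_callback(n, m))
    for n, m in model.named_modules()
    if n  # skip the root container itself
]
model(torch.randn(1, 4))
for h in handles:
    h.remove()
print([name for name, _ in execution_order])  # ['0', '1', '2']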

src/diffusers/loaders/lora_pipeline.py

Lines changed: 32 additions & 2 deletions

@@ -4259,7 +4259,33 @@ def lora_state_dict(
 
         return state_dict
 
-    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.load_lora_weights
+    @classmethod
+    def _maybe_expand_t2v_lora_for_i2v(
+        cls,
+        transformer: torch.nn.Module,
+        state_dict,
+    ):
+        if transformer.config.image_dim is None:
+            return state_dict
+
+        if any(k.startswith("transformer.blocks.") for k in state_dict):
+            num_blocks = len({k.split("blocks.")[1].split(".")[0] for k in state_dict})
+            is_i2v_lora = any("add_k_proj" in k for k in state_dict) and any("add_v_proj" in k for k in state_dict)
+
+            if is_i2v_lora:
+                return state_dict
+
+            for i in range(num_blocks):
+                for o, c in zip(["k_img", "v_img"], ["add_k_proj", "add_v_proj"]):
+                    state_dict[f"transformer.blocks.{i}.attn2.{c}.lora_A.weight"] = torch.zeros_like(
+                        state_dict[f"transformer.blocks.{i}.attn2.to_k.lora_A.weight"]
+                    )
+                    state_dict[f"transformer.blocks.{i}.attn2.{c}.lora_B.weight"] = torch.zeros_like(
+                        state_dict[f"transformer.blocks.{i}.attn2.to_k.lora_B.weight"]
+                    )
+
+        return state_dict
+
     def load_lora_weights(
         self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs
     ):
@@ -4297,7 +4323,11 @@ def load_lora_weights(
 
         # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
         state_dict = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
-
+        # convert T2V LoRA to I2V LoRA (when loaded to Wan I2V) by adding zeros for the additional (missing) _img layers
+        state_dict = self._maybe_expand_t2v_lora_for_i2v(
+            transformer=getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer,
+            state_dict=state_dict,
+        )
        is_correct_format = all("lora" in key for key in state_dict.keys())
        if not is_correct_format:
            raise ValueError("Invalid LoRA checkpoint.")
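_maybe_expand_t2v_lora_for_i2v lets a text-to-video Wan LoRA load into an image-to-video transformer by padding the checkpoint with zero LoRA weights for the extra add_k_proj/add_v_proj (image) projections, so those layers exist but contribute nothing. A toy, self-contained illustration of that padding (shapes and block index chosen for the example):

import torch

# Pretend T2V LoRA checkpoint with only the standard attention projections.
state_dict = {
    "transformer.blocks.0.attn2.to_k.lora_A.weight": torch.randn(4, 64),
    "transformer.blocks.0.attn2.to_k.lora_B.weight": torch.randn(64, 4),
}

# Pad the missing image projections with zeros of matching shape.
for c in ("add_k_proj", "add_v_proj"):
    for mat in ("lora_A", "lora_B"):
        ref = state_dict[f"transformer.blocks.0.attn2.to_k.{mat}.weight"]
        state_dict[f"transformer.blocks.0.attn2.{c}.{mat}.weight"] = torch.zeros_like(ref)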

src/diffusers/models/resnet.py

Lines changed: 1 addition & 1 deletion

@@ -366,7 +366,7 @@ def forward(self, input_tensor: torch.Tensor, temb: torch.Tensor, *args, **kwargs)
        hidden_states = self.conv2(hidden_states)
 
        if self.conv_shortcut is not None:
-           input_tensor = self.conv_shortcut(input_tensor)
+           input_tensor = self.conv_shortcut(input_tensor.contiguous())
 
        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
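The shortcut convolution now receives a contiguous tensor. A quick standalone check (assumption: this mirrors the intent of the change, namely normalizing the memory layout of the shortcut input before the 1x1 conv):

import torch

conv_shortcut = torch.nn.Conv2d(8, 8, kernel_size=1)
x = torch.randn(1, 8, 16, 16).to(memory_format=torch.channels_last)
print(x.is_contiguous())               # False: channels_last, not the default layout
y = conv_shortcut(x.contiguous())      # .contiguous() copies back to default NCHW layout
print(y.shape, y.is_contiguous())      # torch.Size([1, 8, 16, 16]) True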

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 2 additions & 2 deletions

@@ -427,7 +427,7 @@ def module_is_offloaded(module):
                "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
            )
 
-        if device_type == "cuda":
+        if device_type in ["cuda", "xpu"]:
            if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
                raise ValueError(
                    "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
@@ -440,7 +440,7 @@ def module_is_offloaded(module):
 
        # Display a warning in this case (the operation succeeds but the benefits are lost)
        pipeline_is_offloaded = any(module_is_offloaded(module) for _, module in self.components.items())
-       if pipeline_is_offloaded and device_type == "cuda":
+       if pipeline_is_offloaded and device_type in ["cuda", "xpu"]:
            logger.warning(
                f"It seems like you have activated model offloading by calling `enable_model_cpu_offload`, but are now manually moving the pipeline to GPU. It is strongly recommended against doing so as memory gains from offloading are likely to be lost. Offloading automatically takes care of moving the individual components {', '.join(self.components.keys())} to GPU when needed. To make sure offloading works as expected, you should consider moving the pipeline back to CPU: `pipeline.to('cpu')` or removing the move altogether if you use offloading."
            )
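With the "xpu" device type now handled alongside "cuda", moving an offloaded pipeline to an Intel GPU triggers the same error/warning paths. A hedged usage sketch (the model id is a placeholder, and the snippet assumes an XPU-enabled PyTorch build and that enable_model_cpu_offload accepts a device argument):

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("some-org/some-diffusion-model", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload(device="xpu")  # keep components on CPU, onload to XPU on demand
pipe.to("xpu")  # now logs the same "offloading benefits are lost" warning as on CUDA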

src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py

Lines changed: 18 additions & 7 deletions

@@ -61,7 +61,7 @@ def __init__(self, quantization_config, **kwargs):
        self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules
 
    def validate_environment(self, *args, **kwargs):
-       if not torch.cuda.is_available():
+       if not (torch.cuda.is_available() or torch.xpu.is_available()):
            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
        if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"):
            raise ImportError(
@@ -238,11 +238,15 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
 
    def update_device_map(self, device_map):
        if device_map is None:
-           device_map = {"": f"cuda:{torch.cuda.current_device()}"}
+           if torch.xpu.is_available():
+               current_device = f"xpu:{torch.xpu.current_device()}"
+           else:
+               current_device = f"cuda:{torch.cuda.current_device()}"
+           device_map = {"": current_device}
            logger.info(
                "The device_map was not initialized. "
                "Setting device_map to {"
-               ": f`cuda:{torch.cuda.current_device()}`}. "
+               ": {current_device}}. "
                "If you want to use the model for inference, please set device_map ='auto' "
            )
        return device_map
@@ -312,7 +316,10 @@ def _dequantize(self, model):
            logger.info(
                "Model was found to be on CPU (could happen as a result of `enable_model_cpu_offload()`). So, moving it to GPU. After dequantization, will move the model back to CPU again to preserve the previous device."
            )
-           model.to(torch.cuda.current_device())
+           if torch.xpu.is_available():
+               model.to(torch.xpu.current_device())
+           else:
+               model.to(torch.cuda.current_device())
 
        model = dequantize_and_replace(
            model, self.modules_to_not_convert, quantization_config=self.quantization_config
@@ -343,7 +350,7 @@ def __init__(self, quantization_config, **kwargs):
        self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules
 
    def validate_environment(self, *args, **kwargs):
-       if not torch.cuda.is_available():
+       if not (torch.cuda.is_available() or torch.xpu.is_available()):
            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
        if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"):
            raise ImportError(
@@ -402,11 +409,15 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
    # Copied from diffusers.quantizers.bitsandbytes.bnb_quantizer.BnB4BitDiffusersQuantizer.update_device_map
    def update_device_map(self, device_map):
        if device_map is None:
-           device_map = {"": f"cuda:{torch.cuda.current_device()}"}
+           if torch.xpu.is_available():
+               current_device = f"xpu:{torch.xpu.current_device()}"
+           else:
+               current_device = f"cuda:{torch.cuda.current_device()}"
+           device_map = {"": current_device}
            logger.info(
                "The device_map was not initialized. "
                "Setting device_map to {"
-               ": f`cuda:{torch.cuda.current_device()}`}. "
+               ": {current_device}}. "
                "If you want to use the model for inference, please set device_map ='auto' "
            )
        return device_map
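Both the 4-bit and 8-bit quantizers now pick the accelerator the same way. The selection pattern, extracted as a standalone sketch (the hasattr guard is an extra safety for older PyTorch builds, not part of the diff):

import torch

if hasattr(torch, "xpu") and torch.xpu.is_available():
    current_device = f"xpu:{torch.xpu.current_device()}"
else:
    current_device = f"cuda:{torch.cuda.current_device()}"

device_map = {"": current_device}  # place the whole model on the selected accelerator
print(device_map)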

src/diffusers/utils/export_utils.py

Lines changed: 28 additions & 3 deletions

@@ -3,7 +3,7 @@
 import struct
 import tempfile
 from contextlib import contextmanager
-from typing import List, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import PIL.Image
@@ -139,8 +139,31 @@ def _legacy_export_to_video(
 
 
 def export_to_video(
-    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]], output_video_path: str = None, fps: int = 10
+    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]],
+    output_video_path: str = None,
+    fps: int = 10,
+    quality: float = 5.0,
+    bitrate: Optional[int] = None,
+    macro_block_size: Optional[int] = 16,
 ) -> str:
+    """
+    quality:
+        Video output quality. Default is 5. Uses variable bit rate. Highest quality is 10, lowest is 0. Set to None to
+        prevent variable bitrate flags to FFMPEG so you can manually specify them using output_params instead.
+        Specifying a fixed bitrate using `bitrate` disables this parameter.
+
+    bitrate:
+        Set a constant bitrate for the video encoding. Default is None causing `quality` parameter to be used instead.
+        Better quality videos with smaller file sizes will result from using the `quality` variable bitrate parameter
+        rather than specifiying a fixed bitrate with this parameter.
+
+    macro_block_size:
+        Size constraint for video. Width and height, must be divisible by this number. If not divisible by this number
+        imageio will tell ffmpeg to scale the image up to the next closest size divisible by this number. Most codecs
+        are compatible with a macroblock size of 16 (default), some can go smaller (4, 8). To disable this automatic
+        feature set it to None or 1, however be warned many players can't decode videos that are odd in size and some
+        codecs will produce poor results or fail. See https://en.wikipedia.org/wiki/Macroblock.
+    """
     # TODO: Dhruv. Remove by Diffusers release 0.33.0
     # Added to prevent breaking existing code
     if not is_imageio_available():
@@ -177,7 +200,9 @@ def export_to_video(
     elif isinstance(video_frames[0], PIL.Image.Image):
         video_frames = [np.array(frame) for frame in video_frames]
 
-    with imageio.get_writer(output_video_path, fps=fps) as writer:
+    with imageio.get_writer(
+        output_video_path, fps=fps, quality=quality, bitrate=bitrate, macro_block_size=macro_block_size
+    ) as writer:
         for frame in video_frames:
             writer.append_data(frame)
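The new keywords are forwarded directly to imageio.get_writer. A hedged usage example (requires imageio and imageio-ffmpeg; frame values are assumed to be floats in [0, 1], which the helper scales to uint8):

import numpy as np
from diffusers.utils import export_to_video

frames = [np.random.rand(256, 256, 3).astype(np.float32) for _ in range(16)]
path = export_to_video(frames, "sample.mp4", fps=8, quality=9, macro_block_size=None)
print(path)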

src/diffusers/utils/testing_utils.py

Lines changed: 2 additions & 2 deletions

@@ -574,10 +574,10 @@ def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray:
     return arry
 
 
-def load_pt(url: str):
+def load_pt(url: str, map_location: str):
     response = requests.get(url)
     response.raise_for_status()
-    arry = torch.load(BytesIO(response.content))
+    arry = torch.load(BytesIO(response.content), map_location=map_location)
     return arry
 
 

tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py

Lines changed: 4 additions & 3 deletions

@@ -377,9 +377,10 @@ def test_text_to_image_face_id(self):
         pipeline.set_ip_adapter_scale(0.7)
 
         inputs = self.get_dummy_inputs()
-        id_embeds = load_pt("https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/ai_face2.ipadpt")[
-            0
-        ]
+        id_embeds = load_pt(
+            "https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/ai_face2.ipadpt",
+            map_location=torch_device,
+        )[0]
         id_embeds = id_embeds.reshape((2, 1, 1, 512))
         inputs["ip_adapter_image_embeds"] = [id_embeds]
         inputs["ip_adapter_image"] = None
