
Commit 84d2c84

Merge branch 'main' into to-single-file/flux
2 parents 251bb61 + 9c13f86

19 files changed (+306, -159 lines)

.github/workflows/pr_tests_gpu.yml

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ on:
       - "src/diffusers/loaders/peft.py"
       - "tests/pipelines/test_pipelines_common.py"
       - "tests/models/test_modeling_common.py"
+      - "examples/**/*.py"
   workflow_dispatch:
 
 concurrency:

examples/dreambooth/train_dreambooth_lora_hidream.py

Lines changed: 26 additions & 36 deletions
@@ -58,6 +58,7 @@
     compute_density_for_timestep_sampling,
     compute_loss_weighting_for_sd3,
     free_memory,
+    offload_models,
 )
 from diffusers.utils import (
     check_min_version,
@@ -1364,43 +1365,34 @@ def compute_text_embeddings(prompt, text_encoding_pipeline):
     # provided (i.e. the --instance_prompt is used for all images), we encode the instance prompt once to avoid
     # the redundant encoding.
     if not train_dataset.custom_instance_prompts:
-        if args.offload:
-            text_encoding_pipeline = text_encoding_pipeline.to(accelerator.device)
-        (
-            instance_prompt_hidden_states_t5,
-            instance_prompt_hidden_states_llama3,
-            instance_pooled_prompt_embeds,
-            _,
-            _,
-            _,
-        ) = compute_text_embeddings(args.instance_prompt, text_encoding_pipeline)
-        if args.offload:
-            text_encoding_pipeline = text_encoding_pipeline.to("cpu")
+        with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+            (
+                instance_prompt_hidden_states_t5,
+                instance_prompt_hidden_states_llama3,
+                instance_pooled_prompt_embeds,
+                _,
+                _,
+                _,
+            ) = compute_text_embeddings(args.instance_prompt, text_encoding_pipeline)
 
     # Handle class prompt for prior-preservation.
     if args.with_prior_preservation:
-        if args.offload:
-            text_encoding_pipeline = text_encoding_pipeline.to(accelerator.device)
-        (class_prompt_hidden_states_t5, class_prompt_hidden_states_llama3, class_pooled_prompt_embeds, _, _, _) = (
-            compute_text_embeddings(args.class_prompt, text_encoding_pipeline)
-        )
-        if args.offload:
-            text_encoding_pipeline = text_encoding_pipeline.to("cpu")
+        with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+            (class_prompt_hidden_states_t5, class_prompt_hidden_states_llama3, class_pooled_prompt_embeds, _, _, _) = (
+                compute_text_embeddings(args.class_prompt, text_encoding_pipeline)
+            )
 
     validation_embeddings = {}
     if args.validation_prompt is not None:
-        if args.offload:
-            text_encoding_pipeline = text_encoding_pipeline.to(accelerator.device)
-        (
-            validation_embeddings["prompt_embeds_t5"],
-            validation_embeddings["prompt_embeds_llama3"],
-            validation_embeddings["pooled_prompt_embeds"],
-            validation_embeddings["negative_prompt_embeds_t5"],
-            validation_embeddings["negative_prompt_embeds_llama3"],
-            validation_embeddings["negative_pooled_prompt_embeds"],
-        ) = compute_text_embeddings(args.validation_prompt, text_encoding_pipeline)
-        if args.offload:
-            text_encoding_pipeline = text_encoding_pipeline.to("cpu")
+        with offload_models(text_encoding_pipeline, device=accelerator.device, offload=args.offload):
+            (
+                validation_embeddings["prompt_embeds_t5"],
+                validation_embeddings["prompt_embeds_llama3"],
+                validation_embeddings["pooled_prompt_embeds"],
+                validation_embeddings["negative_prompt_embeds_t5"],
+                validation_embeddings["negative_prompt_embeds_llama3"],
+                validation_embeddings["negative_pooled_prompt_embeds"],
+            ) = compute_text_embeddings(args.validation_prompt, text_encoding_pipeline)
 
     # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
     # pack the statically computed variables appropriately here. This is so that we don't
@@ -1581,12 +1573,10 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
         if args.cache_latents:
             model_input = latents_cache[step].sample()
         else:
-            if args.offload:
-                vae = vae.to(accelerator.device)
-            pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+            with offload_models(vae, device=accelerator.device, offload=args.offload):
+                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
             model_input = vae.encode(pixel_values).latent_dist.sample()
-            if args.offload:
-                vae = vae.to("cpu")
+
         model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
         model_input = model_input.to(dtype=weight_dtype)

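In the hunks above, the repeated `if args.offload:` bookkeeping around each encode step is replaced by a single `offload_models` context manager from diffusers' training utilities. As a minimal sketch, assuming the helper simply scopes device placement (the real implementation may differ), it plausibly looks like this:

# Minimal sketch of an offload_models-style context manager;
# illustrative only, not the actual diffusers implementation.
from contextlib import contextmanager


@contextmanager
def offload_models_sketch(*modules, device, offload: bool):
    # Move modules to the accelerator device only while the block runs.
    if offload:
        for module in modules:
            module.to(device)
    try:
        yield modules
    finally:
        # Move them back to CPU on exit, even if the block raised, so
        # accelerator memory is freed once the embeddings are computed.
        if offload:
            for module in modules:
                module.to("cpu")

Because the move-back lives in a `finally` block, the model returns to CPU even when encoding raises, a guarantee the old paired `if args.offload:` blocks did not provide.
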
src/diffusers/configuration_utils.py

Lines changed: 4 additions & 1 deletion
@@ -763,4 +763,7 @@ def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_un
         # resolve remapping
         remapped_class = _fetch_remapped_cls_from_config(config, cls)
 
-        return remapped_class.from_config(config, return_unused_kwargs, **kwargs)
+        if remapped_class is cls:
+            return super(LegacyConfigMixin, remapped_class).from_config(config, return_unused_kwargs, **kwargs)
+        else:
+            return remapped_class.from_config(config, return_unused_kwargs, **kwargs)

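The new branch in `LegacyConfigMixin.from_config` guards against infinite recursion: when `_fetch_remapped_cls_from_config` resolves to the same class, calling `remapped_class.from_config` would re-enter this override forever, so `super(LegacyConfigMixin, remapped_class)` resumes method lookup past the legacy override. A self-contained sketch of the pattern, using stand-in classes rather than the diffusers ones:

# Stand-ins to illustrate the dispatch; resolve_remap is hypothetical.
def resolve_remap(config, cls):
    # Pretend the config maps back to the same class.
    return cls


class Base:
    def __init__(self, **config):
        self.config = config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


class Legacy(Base):
    @classmethod
    def from_config(cls, config):
        remapped_class = resolve_remap(config, cls)
        if remapped_class is cls:
            # Re-entering this override would recurse forever; super(Legacy, ...)
            # continues the lookup after Legacy in the MRO and reaches Base.
            return super(Legacy, remapped_class).from_config(config)
        return remapped_class.from_config(config)


obj = Legacy.from_config({"hidden_size": 8})  # Legacy instance, no recursion
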
src/diffusers/loaders/single_file_model.py

Lines changed: 1 addition & 4 deletions
@@ -24,7 +24,7 @@
 from .. import __version__
 from ..quantizers import DiffusersAutoQuantizer
 from ..utils import deprecate, is_accelerate_available, logging
-from ..utils.torch_utils import device_synchronize, empty_device_cache
+from ..utils.torch_utils import empty_device_cache
 from .single_file_utils import (
     SingleFileComponentError,
     convert_animatediff_checkpoint_to_diffusers,
@@ -431,10 +431,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
                 keep_in_fp32_modules=keep_in_fp32_modules,
                 unexpected_keys=unexpected_keys,
             )
-            # Ensure tensors are correctly placed on device by synchronizing before returning control to user. This is
-            # required because we move tensors with non_blocking=True, which is slightly faster for model loading.
             empty_device_cache()
-            device_synchronize()
         else:
             _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)

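The same `device_synchronize()` deletion repeats across the loader and model files below. For context: weights are moved with `non_blocking=True` during loading, and the removed call was a blocking barrier before returning control to the user. A rough sketch of the two helpers, assuming a CUDA-only backend (the real versions in `src/diffusers/utils/torch_utils.py` cover other accelerators):

# Rough, CUDA-only sketch; the actual diffusers helpers are
# backend-agnostic and may differ in detail.
import torch


def empty_device_cache():
    # Return cached, currently unused allocator blocks to the driver.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def device_synchronize():
    # Block until all queued kernels and async (non_blocking=True) copies
    # on the current device have finished.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

With the sync gone, `empty_device_cache()` is the last step; presumably releasing the cache already forces the device to quiesce, making the explicit barrier redundant.
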
src/diffusers/loaders/single_file_utils.py

Lines changed: 1 addition & 7 deletions
@@ -46,7 +46,7 @@
 )
 from ..utils.constants import DIFFUSERS_REQUEST_TIMEOUT
 from ..utils.hub_utils import _get_model_file
-from ..utils.torch_utils import device_synchronize, empty_device_cache
+from ..utils.torch_utils import empty_device_cache
 
 
 if is_transformers_available():
@@ -1690,10 +1690,7 @@ def create_diffusers_clip_model_from_ldm(
 
     if is_accelerate_available():
         load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
-        # Ensure tensors are correctly placed on device by synchronizing before returning control to user. This is
-        # required because we move tensors with non_blocking=True, which is slightly faster for model loading.
         empty_device_cache()
-        device_synchronize()
     else:
         model.load_state_dict(diffusers_format_checkpoint, strict=False)
 
@@ -2153,10 +2150,7 @@ def create_diffusers_t5_model_from_checkpoint(
 
     if is_accelerate_available():
         load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
-        # Ensure tensors are correctly placed on device by synchronizing before returning control to user. This is
-        # required because we move tensors with non_blocking=True, which is slightly faster for model loading.
         empty_device_cache()
-        device_synchronize()
     else:
         model.load_state_dict(diffusers_format_checkpoint)

src/diffusers/loaders/transformer_flux.py

Lines changed: 1 addition & 3 deletions
@@ -19,7 +19,7 @@
 )
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
 from ..utils import is_accelerate_available, is_torch_version, logging
-from ..utils.torch_utils import device_synchronize, empty_device_cache
+from ..utils.torch_utils import empty_device_cache
 
 
 if is_accelerate_available():
@@ -82,7 +82,6 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us
             device_map = {"": self.device}
             load_model_dict_into_meta(image_projection, updated_state_dict, device_map=device_map, dtype=self.dtype)
         empty_device_cache()
-        device_synchronize()
 
         return image_projection
 
@@ -156,7 +155,6 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=_
             key_id += 1
 
         empty_device_cache()
-        device_synchronize()
 
         return attn_procs

src/diffusers/loaders/transformer_sd3.py

Lines changed: 1 addition & 3 deletions
@@ -18,7 +18,7 @@
 from ..models.embeddings import IPAdapterTimeImageProjection
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
 from ..utils import is_accelerate_available, is_torch_version, logging
-from ..utils.torch_utils import device_synchronize, empty_device_cache
+from ..utils.torch_utils import empty_device_cache
 
 
 logger = logging.get_logger(__name__)
@@ -82,7 +82,6 @@ def _convert_ip_adapter_attn_to_diffusers(
             )
 
         empty_device_cache()
-        device_synchronize()
 
         return attn_procs
 
@@ -152,7 +151,6 @@ def _convert_ip_adapter_image_proj_to_diffusers(
             device_map = {"": self.device}
             load_model_dict_into_meta(image_proj, updated_state_dict, device_map=device_map, dtype=self.dtype)
         empty_device_cache()
-        device_synchronize()
 
         return image_proj

src/diffusers/loaders/unet.py

Lines changed: 1 addition & 3 deletions
@@ -43,7 +43,7 @@
     is_torch_version,
     logging,
 )
-from ..utils.torch_utils import device_synchronize, empty_device_cache
+from ..utils.torch_utils import empty_device_cache
 from .lora_base import _func_optionally_disable_offloading
 from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE, TEXT_ENCODER_NAME, UNET_NAME
 from .utils import AttnProcsLayers
@@ -755,7 +755,6 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us
             device_map = {"": self.device}
             load_model_dict_into_meta(image_projection, updated_state_dict, device_map=device_map, dtype=self.dtype)
         empty_device_cache()
-        device_synchronize()
 
         return image_projection
 
@@ -854,7 +853,6 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=_
             key_id += 2
 
         empty_device_cache()
-        device_synchronize()
 
         return attn_procs

src/diffusers/models/modeling_utils.py

Lines changed: 7 additions & 5 deletions
@@ -62,7 +62,7 @@
     load_or_create_model_card,
     populate_model_card,
 )
-from ..utils.torch_utils import device_synchronize, empty_device_cache
+from ..utils.torch_utils import empty_device_cache
 from .model_loading_utils import (
     _caching_allocator_warmup,
     _determine_device_map,
@@ -1590,10 +1590,7 @@ def _load_pretrained_model(
             assign_to_params_buffers = check_support_param_buffer_assignment(model, state_dict)
             error_msgs += _load_state_dict_into_model(model, state_dict, assign_to_params_buffers)
 
-        # Ensure tensors are correctly placed on device by synchronizing before returning control to user. This is
-        # required because we move tensors with non_blocking=True, which is slightly faster for model loading.
         empty_device_cache()
-        device_synchronize()
 
         if offload_index is not None and len(offload_index) > 0:
             save_offload_index(offload_index, offload_folder)
@@ -1930,4 +1927,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         # resolve remapping
         remapped_class = _fetch_remapped_cls_from_config(config, cls)
 
-        return remapped_class.from_pretrained(pretrained_model_name_or_path, **kwargs_copy)
+        if remapped_class is cls:
+            return super(LegacyModelMixin, remapped_class).from_pretrained(
+                pretrained_model_name_or_path, **kwargs_copy
+            )
+        else:
+            return remapped_class.from_pretrained(pretrained_model_name_or_path, **kwargs_copy)

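The `from_pretrained` change mirrors the `LegacyConfigMixin.from_config` guard earlier in this commit; both rely on the two-argument `super` form taking a class, not an instance, as its second argument. A quick plain-Python demonstration of that binding rule:

# Plain-Python demonstration; A and B are illustrative stand-ins.
class A:
    @classmethod
    def load(cls):
        return f"A.load via {cls.__name__}"


class B(A):
    @classmethod
    def load(cls):
        return f"B.load via {cls.__name__}"


# super(B, B) starts attribute lookup *after* B in B.__mro__, so B's
# override is skipped while cls stays bound to B.
print(super(B, B).load())  # -> "A.load via B"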