update

a-r-r-o-w · a-r-r-o-w · commit 9e4873becc4a · 2025-07-10T04:49:50.000+02:00
diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py
@@ -24,6 +24,7 @@
 from .. import __version__
 from ..quantizers import DiffusersAutoQuantizer
 from ..utils import deprecate, is_accelerate_available, logging
+from ..utils.torch_utils import device_synchronize, empty_device_cache
 from .single_file_utils import (
     SingleFileComponentError,
     convert_animatediff_checkpoint_to_diffusers,
@@ -430,6 +431,8 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
                 keep_in_fp32_modules=keep_in_fp32_modules,
                 unexpected_keys=unexpected_keys,
             )
+            empty_device_cache()
+            device_synchronize()
         else:
             _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)
 
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
@@ -46,6 +46,7 @@
 )
 from ..utils.constants import DIFFUSERS_REQUEST_TIMEOUT
 from ..utils.hub_utils import _get_model_file
+from ..utils.torch_utils import device_synchronize, empty_device_cache
 
 
 if is_transformers_available():
@@ -1689,6 +1690,8 @@ def create_diffusers_clip_model_from_ldm(
 
     if is_accelerate_available():
         load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
+        empty_device_cache()
+        device_synchronize()
     else:
         model.load_state_dict(diffusers_format_checkpoint, strict=False)
 
@@ -2148,6 +2151,8 @@ def create_diffusers_t5_model_from_checkpoint(
 
     if is_accelerate_available():
         load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
+        empty_device_cache()
+        device_synchronize()
     else:
         model.load_state_dict(diffusers_format_checkpoint)
 
diff --git a/src/diffusers/loaders/transformer_flux.py b/src/diffusers/loaders/transformer_flux.py
@@ -18,11 +18,8 @@
     MultiIPAdapterImageProjection,
 )
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
-from ..utils import (
-    is_accelerate_available,
-    is_torch_version,
-    logging,
-)
+from ..utils import is_accelerate_available, is_torch_version, logging
+from ..utils.torch_utils import device_synchronize, empty_device_cache
 
 
 if is_accelerate_available():
@@ -84,6 +81,8 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us
         else:
             device_map = {"": self.device}
             load_model_dict_into_meta(image_projection, updated_state_dict, device_map=device_map, dtype=self.dtype)
+            empty_device_cache()
+            device_synchronize()
 
         return image_projection
 
@@ -158,6 +157,9 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=_
 
                 key_id += 1
 
+        empty_device_cache()
+        device_synchronize()
+
         return attn_procs
 
     def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=_LOW_CPU_MEM_USAGE_DEFAULT):
diff --git a/src/diffusers/loaders/transformer_sd3.py b/src/diffusers/loaders/transformer_sd3.py
@@ -18,6 +18,7 @@
 from ..models.embeddings import IPAdapterTimeImageProjection
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
 from ..utils import is_accelerate_available, is_torch_version, logging
+from ..utils.torch_utils import device_synchronize, empty_device_cache
 
 
 logger = logging.get_logger(__name__)
@@ -80,6 +81,9 @@ def _convert_ip_adapter_attn_to_diffusers(
                     attn_procs[name], layer_state_dict[idx], device_map=device_map, dtype=self.dtype
                 )
 
+        empty_device_cache()
+        device_synchronize()
+
         return attn_procs
 
     def _convert_ip_adapter_image_proj_to_diffusers(
@@ -147,6 +151,8 @@ def _convert_ip_adapter_image_proj_to_diffusers(
         else:
             device_map = {"": self.device}
             load_model_dict_into_meta(image_proj, updated_state_dict, device_map=device_map, dtype=self.dtype)
+            empty_device_cache()
+            device_synchronize()
 
         return image_proj
 
diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py
@@ -44,6 +44,7 @@
     is_torch_version,
     logging,
 )
+from ..utils.torch_utils import device_synchronize, empty_device_cache
 from .lora_base import _func_optionally_disable_offloading
 from .lora_pipeline import LORA_WEIGHT_NAME, LORA_WEIGHT_NAME_SAFE, TEXT_ENCODER_NAME, UNET_NAME
 from .utils import AttnProcsLayers
@@ -752,6 +753,8 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_us
         else:
             device_map = {"": self.device}
             load_model_dict_into_meta(image_projection, updated_state_dict, device_map=device_map, dtype=self.dtype)
+            empty_device_cache()
+            device_synchronize()
 
         return image_projection
 
@@ -849,6 +852,9 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=_
 
                 key_id += 2
 
+        empty_device_cache()
+        device_synchronize()
+
         return attn_procs
 
     def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=_LOW_CPU_MEM_USAGE_DEFAULT):
diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py
@@ -231,16 +231,6 @@ def load_model_dict_into_meta(
 
     is_quantized = hf_quantizer is not None
     empty_state_dict = model.state_dict()
-    expanded_device_map = {}
-
-    # if device_map is not None:
-    #     for param_name, param in state_dict.items():
-    #         if param_name not in empty_state_dict:
-    #             continue
-    #         param_device = _determine_param_device(param_name, device_map)
-    #         expanded_device_map[param_name] = param_device
-    #     print(expanded_device_map)
-    #     _caching_allocator_warmup(model, expanded_device_map, dtype)
 
     for param_name, param in state_dict.items():
         if param_name not in empty_state_dict:
@@ -310,7 +300,15 @@ def load_model_dict_into_meta(
                 model, param, param_name, param_device, state_dict, unexpected_keys, dtype=dtype
             )
         else:
-            set_module_tensor_to_device(model, param_name, param_device, value=param, **set_module_kwargs)
+            set_module_tensor_to_device(
+                model,
+                param_name,
+                param_device,
+                value=param,
+                non_blocking=True,
+                _empty_cache=False,
+                **set_module_kwargs,
+            )
 
     return offload_index, state_dict_index
 
@@ -533,6 +531,41 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
     return parsed_parameters
 
 
+def _find_mismatched_keys(
+    state_dict,
+    model_state_dict,
+    loaded_keys,
+    ignore_mismatched_sizes,
+):
+    mismatched_keys = []
+    if not ignore_mismatched_sizes:
+        return mismatched_keys
+    for checkpoint_key in loaded_keys:
+        model_key = checkpoint_key
+        # If the checkpoint is sharded, we may not have the key here.
+        if checkpoint_key not in state_dict:
+            continue
+
+        if model_key in model_state_dict and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape:
+            mismatched_keys.append(
+                (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
+            )
+            del state_dict[checkpoint_key]
+    return mismatched_keys
+
+
+def _expand_device_map(device_map, param_names):
+    """
+    Expand a device map to return the correspondence parameter name to device.
+    """
+    new_device_map = {}
+    for module, device in device_map.items():
+        new_device_map.update(
+            {p: device for p in param_names if p == module or p.startswith(f"{module}.") or module == ""}
+        )
+    return new_device_map
+
+
 # Adapted from: https://github.com/huggingface/transformers/blob/0687d481e2c71544501ef9cb3eef795a6e79b1de/src/transformers/modeling_utils.py#L5859
 def _caching_allocator_warmup(model, expanded_device_map: Dict[str, torch.device], dtype: torch.dtype) -> None:
     """This function warm-ups the caching allocator based on the size of the model tensors that will reside on each
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
@@ -62,10 +62,14 @@
     load_or_create_model_card,
     populate_model_card,
 )
+from ..utils.torch_utils import device_synchronize, empty_device_cache
 from .model_loading_utils import (
+    _caching_allocator_warmup,
     _determine_device_map,
+    _expand_device_map,
     _fetch_index_file,
     _fetch_index_file_legacy,
+    _find_mismatched_keys,
     _load_state_dict_into_model,
     load_model_dict_into_meta,
     load_state_dict,
@@ -1469,11 +1473,6 @@ def _load_pretrained_model(
             for pat in cls._keys_to_ignore_on_load_unexpected:
                 unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
 
-        mismatched_keys = []
-
-        assign_to_params_buffers = None
-        error_msgs = []
-
         # Deal with offload
         if device_map is not None and "disk" in device_map.values():
             if offload_folder is None:
@@ -1482,18 +1481,21 @@ def _load_pretrained_model(
                     " for them. Alternatively, make sure you have `safetensors` installed if the model you are using"
                     " offers the weights in this format."
                 )
-            if offload_folder is not None:
+            else:
                 os.makedirs(offload_folder, exist_ok=True)
             if offload_state_dict is None:
                 offload_state_dict = True
 
+        # Caching allocator warmup
+        if device_map is not None:
+            expanded_device_map = _expand_device_map(device_map, expected_keys)
+            _caching_allocator_warmup(model, expanded_device_map, dtype)
+
         offload_index = {} if device_map is not None and "disk" in device_map.values() else None
+        state_dict_folder, state_dict_index = None, None
         if offload_state_dict:
             state_dict_folder = tempfile.mkdtemp()
             state_dict_index = {}
-        else:
-            state_dict_folder = None
-            state_dict_index = None
 
         if state_dict is not None:
             # load_state_dict will manage the case where we pass a dict instead of a file
@@ -1503,38 +1505,14 @@ def _load_pretrained_model(
         if len(resolved_model_file) > 1:
             resolved_model_file = logging.tqdm(resolved_model_file, desc="Loading checkpoint shards")
 
+        mismatched_keys = []
+        assign_to_params_buffers = None
+        error_msgs = []
+
         for shard_file in resolved_model_file:
             state_dict = load_state_dict(shard_file, dduf_entries=dduf_entries)
-
-            def _find_mismatched_keys(
-                state_dict,
-                model_state_dict,
-                loaded_keys,
-                ignore_mismatched_sizes,
-            ):
-                mismatched_keys = []
-                if ignore_mismatched_sizes:
-                    for checkpoint_key in loaded_keys:
-                        model_key = checkpoint_key
-                        # If the checkpoint is sharded, we may not have the key here.
-                        if checkpoint_key not in state_dict:
-                            continue
-
-                        if (
-                            model_key in model_state_dict
-                            and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
-                        ):
-                            mismatched_keys.append(
-                                (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
-                            )
-                            del state_dict[checkpoint_key]
-                return mismatched_keys
-
             mismatched_keys += _find_mismatched_keys(
-                state_dict,
-                model_state_dict,
-                loaded_keys,
-                ignore_mismatched_sizes,
+                state_dict, model_state_dict, loaded_keys, ignore_mismatched_sizes
             )
 
             if low_cpu_mem_usage:
@@ -1554,11 +1532,11 @@ def _find_mismatched_keys(
             else:
                 if assign_to_params_buffers is None:
                     assign_to_params_buffers = check_support_param_buffer_assignment(model, state_dict)
-
                 error_msgs += _load_state_dict_into_model(model, state_dict, assign_to_params_buffers)
 
-        torch.cuda.synchronize()
-        
+        empty_device_cache()
+        device_synchronize()
+
         if offload_index is not None and len(offload_index) > 0:
             save_offload_index(offload_index, offload_folder)
             offload_index = None
diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
@@ -182,5 +182,14 @@ def get_device():
 def empty_device_cache(device_type: Optional[str] = None):
     if device_type is None:
         device_type = get_device()
+    if device_type in ["cpu"]:
+        return
     device_mod = getattr(torch, device_type, torch.cuda)
     device_mod.empty_cache()
+
+
+def device_synchronize(device_type: Optional[str] = None):
+    if device_type is None:
+        device_type = get_device()
+    device_mod = getattr(torch, device_type, torch.cuda)
+    device_mod.synchronize()