@@ -1486,8 +1486,14 @@ def _load_pretrained_model(
             if offload_state_dict is None:
                 offload_state_dict = True
 
-        # Caching allocator warmup
-        if device_map is not None:
+        # If a device map has been used, we can speed up the load time by warming up the device caching allocator.
+        # Without the warmup, each tensor allocation on device goes through the allocator for memory (effectively,
+        # a lot of individual calls to device malloc). Instead, we can preallocate the memory required by the
+        # tensors from their expected shapes, without performing any initialization of the memory (empty data).
+        # When the actual device allocations happen, the allocator already has a pool of unused device memory
+        # that it can re-use for faster loading of the model.
+        # TODO: add support for warmup with hf_quantizer
+        if device_map is not None and hf_quantizer is None:
             expanded_device_map = _expand_device_map(device_map, expected_keys)
             _caching_allocator_warmup(model, expanded_device_map, dtype)
 
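The warmup added above can be illustrated with a small standalone sketch. Everything below is hypothetical and is not the library's `_caching_allocator_warmup`: it sums, per device, the bytes that the parameters in a device map will eventually need, allocates one uninitialized buffer of that size, and immediately drops the reference so the caching allocator keeps the memory in its pool for the real per-tensor allocations.

```python
from collections import defaultdict

import torch


def warmup_caching_allocator(param_shapes, device_map, dtype=torch.float16):
    """Hypothetical sketch of a device caching-allocator warmup.

    param_shapes: parameter name -> shape (tuple of ints)
    device_map:   parameter name -> device string (e.g. "cuda:0", "cpu", "disk")
    """
    bytes_per_elem = torch.empty(0, dtype=dtype).element_size()

    # Sum the memory each accelerator device will eventually need.
    bytes_per_device = defaultdict(int)
    for name, shape in param_shapes.items():
        device = device_map.get(name)
        if device is None or device in ("cpu", "disk"):
            continue  # only accelerator devices benefit from the warmup
        numel = 1
        for dim in shape:
            numel *= dim
        bytes_per_device[device] += numel * bytes_per_elem

    # Allocate uninitialized memory once per device, then drop the reference.
    # The caching allocator keeps the freed block in its pool, so the later
    # per-tensor allocations are served without fresh device mallocs.
    for device, total_bytes in bytes_per_device.items():
        _ = torch.empty(total_bytes // bytes_per_elem, dtype=dtype, device=device)
```

With a device map such as `{"transformer.wte.weight": "cuda:0", ...}`, this reserves the pool in one large allocation per device instead of one small allocation per tensor.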
@@ -1534,6 +1540,8 @@ def _load_pretrained_model(
                     assign_to_params_buffers = check_support_param_buffer_assignment(model, state_dict)
                 error_msgs += _load_state_dict_into_model(model, state_dict, assign_to_params_buffers)
 
+        # Ensure tensors are correctly placed on device by synchronizing before returning control to the user. This
+        # is required because we move tensors with non_blocking=True, which is slightly faster for model loading.
         empty_device_cache()
         device_synchronize()
 
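The synchronization added before returning can likewise be shown in isolation. The sketch below is only an illustration under the assumption of a CUDA device and a plain state dict: `non_blocking=True` queues the host-to-device copies asynchronously, so a single synchronize at the end guarantees the data has actually landed on the device before the caller uses it, mirroring the `device_synchronize()` call in the hunk above.

```python
import torch


def move_state_dict_to_device(state_dict, device="cuda"):
    """Minimal sketch: asynchronous copies followed by one synchronize."""
    moved = {}
    for name, tensor in state_dict.items():
        # non_blocking=True queues the host-to-device copy instead of waiting
        # for it to finish; it is effective when the source is in pinned memory.
        moved[name] = tensor.pin_memory().to(device, non_blocking=True)

    # The copies may still be in flight here: wait for the device before
    # returning control to the caller.
    torch.cuda.synchronize(device)
    return moved
```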