Commit dca6388

Commit message: up
1 parent 8968e2f commit dca6388

File tree

2 files changed: +35 -5 lines changed

src/diffusers/models/model_loading_utils.py

Lines changed: 2 additions & 5 deletions

@@ -359,7 +359,6 @@ def _load_shard_file(
     ignore_mismatched_sizes=False,
     low_cpu_mem_usage=False,
 ):
-    assign_to_params_buffers = None
     state_dict = load_state_dict(shard_file, dduf_entries=dduf_entries)
     mismatched_keys = _find_mismatched_keys(
         state_dict,
@@ -383,8 +382,7 @@ def _load_shard_file(
             state_dict_folder=state_dict_folder,
         )
     else:
-        if assign_to_params_buffers is None:
-            assign_to_params_buffers = check_support_param_buffer_assignment(model, state_dict)
+        assign_to_params_buffers = check_support_param_buffer_assignment(model, state_dict)
 
         error_msgs += _load_state_dict_into_model(model, state_dict, assign_to_params_buffers)
     return offload_index, state_dict_index, mismatched_keys, error_msgs
@@ -408,9 +406,8 @@ def _load_shard_files_with_threadpool(
     ignore_mismatched_sizes=False,
     low_cpu_mem_usage=False,
 ):
-    num_workers = int(os.environ.get("HF_PARALLEL_LOADING_WORKERS", str(DEFAULT_HF_PARALLEL_LOADING_WORKERS)))
-
     # Do not spawn anymore workers than you need
+    num_workers = int(os.environ.get("HF_PARALLEL_LOADING_WORKERS", str(DEFAULT_HF_PARALLEL_LOADING_WORKERS)))
     num_workers = min(len(shard_files), num_workers)
 
     logger.info(f"Loading model weights in parallel with {num_workers} workers...")
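
For reference, the relocated worker-count lookup in _load_shard_files_with_threadpool boils down to: read an optional HF_PARALLEL_LOADING_WORKERS override, fall back to a library default, then cap the pool at the number of shard files. Below is a minimal sketch of that behavior; only the HF_PARALLEL_LOADING_WORKERS name comes from the diff, while the default value and the shard list are placeholders standing in for DEFAULT_HF_PARALLEL_LOADING_WORKERS and real checkpoint files.

import os

# Sketch of the worker-count resolution as ordered after this commit.
DEFAULT_WORKERS = 8  # placeholder for DEFAULT_HF_PARALLEL_LOADING_WORKERS


def resolve_num_workers(shard_files):
    num_workers = int(os.environ.get("HF_PARALLEL_LOADING_WORKERS", str(DEFAULT_WORKERS)))
    # Do not spawn more workers than there are shard files to load.
    return min(len(shard_files), num_workers)


print(resolve_num_workers(["model-00001.safetensors", "model-00002.safetensors"]))  # at most 2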

tests/models/test_modeling_common.py

Lines changed: 33 additions & 0 deletions

@@ -1428,6 +1428,39 @@ def test_sharded_checkpoints_with_variant(self):
 
         self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
 
+    @require_torch_accelerator
+    def test_sharded_checkpoints_with_parallel_loading(self):
+        torch.manual_seed(0)
+        config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**config).eval()
+        model = model.to(torch_device)
+
+        base_output = model(**inputs_dict)
+
+        model_size = compute_module_persistent_sizes(model)[""]
+        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
+
+            # Now check if the right number of shards exists. First, let's get the number of shards.
+            # Since this number can be dependent on the model being tested, it's important that we calculate it
+            # instead of hardcoding it.
+            expected_num_shards = caculate_expected_num_shards(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))
+            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
+            self.assertTrue(actual_num_shards == expected_num_shards)
+
+            # Load with parallel loading
+            os.environ["HF_ENABLE_PARALLEL_LOADING"] = "yes"
+            new_model = self.model_class.from_pretrained(tmp_dir).eval()
+            new_model = new_model.to(torch_device)
+
+            torch.manual_seed(0)
+            if "generator" in inputs_dict:
+                _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+            new_output = new_model(**inputs_dict)
+            self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
+
     @require_torch_accelerator
     def test_sharded_checkpoints_device_map(self):
         if self.model_class._no_split_modules is None:
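
Outside the test suite, the behavior this new test exercises is driven by environment variables. A hedged usage sketch follows, assuming a sharded safetensors checkpoint: the HF_ENABLE_PARALLEL_LOADING and HF_PARALLEL_LOADING_WORKERS names come from this commit's diff, while the model class and repository id are placeholders, not part of the change.

import os

import torch
from diffusers import UNet2DConditionModel  # placeholder: any diffusers model class with a sharded checkpoint

# Opt in to parallel shard loading; this mirrors the switch the new test flips.
os.environ["HF_ENABLE_PARALLEL_LOADING"] = "yes"
os.environ["HF_PARALLEL_LOADING_WORKERS"] = "4"  # optional cap on the loading thread pool

model = UNet2DConditionModel.from_pretrained(
    "some-org/some-sharded-checkpoint",  # placeholder repo id
    torch_dtype=torch.float16,
)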
