
Commit 736971c

Merge branch 'main' into cuda-device-map-pipe
2 parents 5e6f142 + 421ee07 commit 736971c

File tree

4 files changed, +28 -5 lines changed


docs/source/en/using-diffusers/loading.md

Lines changed: 24 additions & 0 deletions
@@ -112,6 +112,30 @@ print(pipe.transformer.dtype, pipe.vae.dtype) # (torch.bfloat16, torch.float16)
 
 If a component is not explicitly specified in the dictionary and no `default` is provided, it will be loaded with `torch.float32`.
 
+### Parallel loading
+
+Large models are often [sharded](../training/distributed_inference#model-sharding) into smaller files so that they are easier to load. Diffusers supports loading shards in parallel to speed up the loading process.
+
+Set the environment variables below to enable parallel loading.
+
+- Set `HF_ENABLE_PARALLEL_LOADING` to `"YES"` to enable parallel loading of shards.
+- Set `HF_PARALLEL_LOADING_WORKERS` to configure the number of parallel threads to use when loading shards. More workers load a model faster but use more memory.
+
+The `device_map` argument should be set to `"cuda"` to pre-allocate a large chunk of memory based on the model size. This substantially reduces model load time because warming up the memory allocator up front avoids many smaller calls to the allocator later.
+
+```py
+import os
+import torch
+from diffusers import DiffusionPipeline
+
+os.environ["HF_ENABLE_PARALLEL_LOADING"] = "YES"
+pipeline = DiffusionPipeline.from_pretrained(
+    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
+    torch_dtype=torch.bfloat16,
+    device_map="cuda"
+)
+```
+
 ### Local pipeline
 
 To load a pipeline locally, use [git-lfs](https://git-lfs.github.com/) to manually download a checkpoint to your local disk.
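
The docs snippet above only sets `HF_ENABLE_PARALLEL_LOADING`. Below is a minimal sketch of tuning the worker count as well, assuming `HF_PARALLEL_LOADING_WORKERS` is read as the new docs describe, with the default of 8 coming from `DEFAULT_HF_PARALLEL_LOADING_WORKERS` in `constants.py`; the value 16 is only an example.

```py
import os

# Enable parallel shard loading and raise the worker count above the default of 8.
# The value 16 is illustrative; more workers load faster but use more memory.
os.environ["HF_ENABLE_PARALLEL_LOADING"] = "YES"
os.environ["HF_PARALLEL_LOADING_WORKERS"] = "16"

import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
```

Setting both variables before importing `diffusers` keeps the example independent of when the library reads them.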

src/diffusers/models/modeling_utils.py

Lines changed: 2 additions & 3 deletions
@@ -42,9 +42,8 @@
 from ..quantizers.quantization_config import QuantizationMethod
 from ..utils import (
     CONFIG_NAME,
-    ENV_VARS_TRUE_VALUES,
     FLAX_WEIGHTS_NAME,
-    HF_PARALLEL_LOADING_FLAG,
+    HF_ENABLE_PARALLEL_LOADING,
     SAFE_WEIGHTS_INDEX_NAME,
     SAFETENSORS_WEIGHTS_NAME,
     WEIGHTS_INDEX_NAME,
@@ -962,7 +961,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None)
         disable_mmap = kwargs.pop("disable_mmap", False)
 
-        is_parallel_loading_enabled = os.environ.get(HF_PARALLEL_LOADING_FLAG, "").upper() in ENV_VARS_TRUE_VALUES
+        is_parallel_loading_enabled = HF_ENABLE_PARALLEL_LOADING
         if is_parallel_loading_enabled and not low_cpu_mem_usage:
             raise NotImplementedError("Parallel loading is not supported when not using `low_cpu_mem_usage`.")
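
For context on the guard retained above: with the flag enabled, `from_pretrained` still requires `low_cpu_mem_usage`. A small sketch of what the check means for callers follows; the `AutoModel` class, the checkpoint, and the explicit `low_cpu_mem_usage=False` are illustrative, and `low_cpu_mem_usage` normally defaults to `True` when accelerate is installed.

```py
import os

# Set the flag before importing diffusers so the module-level constant picks it up.
os.environ["HF_ENABLE_PARALLEL_LOADING"] = "YES"

import torch
from diffusers import AutoModel

try:
    # Explicitly disabling low_cpu_mem_usage trips the guard shown in the diff above.
    model = AutoModel.from_pretrained(
        "Wan-AI/Wan2.2-I2V-A14B-Diffusers",
        subfolder="transformer",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=False,
    )
except NotImplementedError as err:
    print(err)  # Parallel loading is not supported when not using `low_cpu_mem_usage`.
```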

src/diffusers/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -25,8 +25,8 @@
     DIFFUSERS_DYNAMIC_MODULE_NAME,
     FLAX_WEIGHTS_NAME,
     GGUF_FILE_EXTENSION,
+    HF_ENABLE_PARALLEL_LOADING,
     HF_MODULES_CACHE,
-    HF_PARALLEL_LOADING_FLAG,
     HUGGINGFACE_CO_RESOLVE_ENDPOINT,
     MIN_PEFT_VERSION,
     ONNX_EXTERNAL_WEIGHTS_NAME,

src/diffusers/utils/constants.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@
 DIFFUSERS_ATTN_BACKEND = os.getenv("DIFFUSERS_ATTN_BACKEND", "native")
 DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0") in ENV_VARS_TRUE_VALUES
 DEFAULT_HF_PARALLEL_LOADING_WORKERS = 8
-HF_PARALLEL_LOADING_FLAG = "HF_ENABLE_PARALLEL_LOADING"
+HF_ENABLE_PARALLEL_LOADING = os.environ.get("HF_ENABLE_PARALLEL_LOADING", "").upper() in ENV_VARS_TRUE_VALUES
 
 # Below should be `True` if the current version of `peft` and `transformers` are compatible with
 # PEFT backend. Will automatically fall back to PEFT backend if the correct versions of the libraries are
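
One implication of the new constant, if it is resolved once when `diffusers.utils.constants` is imported as the line above suggests: the environment variable should be set before the first `diffusers` import in a process for the flag to be picked up. A short sketch of that ordering follows; the exact contents of `ENV_VARS_TRUE_VALUES` are not shown in this diff and are assumed to be the usual Hugging Face truthy strings.

```py
import os

# Set the flag before importing diffusers so the module-level constant sees it.
os.environ["HF_ENABLE_PARALLEL_LOADING"] = "YES"

from diffusers.utils import constants

# True when the variable was set to a truthy value before the import above.
print(constants.HF_ENABLE_PARALLEL_LOADING)
# Accepted truthy strings live in ENV_VARS_TRUE_VALUES (assumed: values like "1", "ON", "YES", "TRUE").
print(constants.ENV_VARS_TRUE_VALUES)
```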
