
Commit 3e4a6bd

Authored by sayakpaul, with SunMarc, stevhliu, and pcuenca
[Core] add "balanced" device_map support to pipelines (#6857)
* get device <-> component mapping when using multiple gpus.
* condition the device_map bits.
* relax condition
* device_map progress.
* device_map enhancement
* some cleaning up and debugging
* Apply suggestions from code review

  Co-authored-by: Marc Sun <[email protected]>
* incorporate suggestions from PR.
* remove multi-gpu condition for now.
* guard check the component -> device mapping
* fix: device_memory variable
* dispatching transformers model to have force_hooks=True
* better guarding for transformers device_map
* introduce support balanced_low_memory and balanced_ultra_low_memory.
* remove device_map patch.
* fix: intermediate variable scoping.
* fix: condition in cpu offload.
* fix: flax class restrictions.
* remove modifications from cpu_offload and model_offload
* incorporate changes.
* add a simple forward pass test
* add: torch_device in get_inputs()
* add: tests
* remove print
* safe-guard to(), model offloading and cpu offloading when balanced is used as a device_map.
* style
* remove .
* safeguard device_map with more checks and remove invalid device_mapping strategies.
* make a class attribute and adjust tests accordingly.
* fix device_map check
* fix test
* adjust comment
* fix: device_map attribute
* fix: dispatching.
* max_memory test for pipeline
* version guard the tests
* fix guard.
* address review feedback.
* reset_device_map method.
* add: test for reset_hf_device_map
* fix a couple things.
* add reset_device_map() in the error message.
* add tests for checking reset_device_map doesn't have unintended consequences.
* fix reset_device_map and offloading tests.
* create _get_final_device_map utility.
* hf_device_map -> _hf_device_map
* add documentation
* add notes suggested by Marc.
* styling.
* Apply suggestions from code review

  Co-authored-by: Steven Liu <[email protected]>
  Co-authored-by: Pedro Cuenca <[email protected]>
* move updates within gpu condition.
* other docs related things
* note on ignore a device not specified in .
* provide a suggestion if device mapping errors out.
* fix: typo.
* _hf_device_map -> hf_device_map
* Empty-Commit
* add: example hf_device_map.

---------

Co-authored-by: Marc Sun <[email protected]>
Co-authored-by: Steven Liu <[email protected]>
Co-authored-by: Pedro Cuenca <[email protected]>
1 parent c827e94 commit 3e4a6bd

File tree

7 files changed: +546 -17 lines changed


docs/source/en/training/distributed_inference.md

Lines changed: 73 additions & 0 deletions
@@ -52,6 +52,79 @@ To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](h
 
 </Tip>
 
+### Device placement
+
+> [!WARNING]
+> This feature is experimental and its APIs might change in the future.
+
+With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU.
+
+For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because:
+
+* it only works on a single GPU
+* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU)
+
+To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs.
+
+> [!TIP]
+> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future.
+
+```diff
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained(
+-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
++    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
+)
+image = pipeline("a dog").images[0]
+image
+```
+
+> [!WARNING]
+> Currently, we only support the "balanced" `device_map`. We plan to support more device mapping strategies in the future.
+
+You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device:
+
+```diff
+from diffusers import DiffusionPipeline
+import torch
+
+max_memory = {0: "1GB", 1: "1GB"}
+pipeline = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    device_map="balanced",
++   max_memory=max_memory
+)
+image = pipeline("a dog").images[0]
+image
+```
+
+If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement.
+
+By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could defer to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`].
+
+Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
+
+```py
+pipeline.reset_device_map()
+```
+
+Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`:
+
+```py
+print(pipeline.hf_device_map)
+```
+
+An example device map would look like so:
+
+```bash
+{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
+```
+
 ## PyTorch Distributed
 
 PyTorch supports [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) which enables data parallelism.
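As a quick, hypothetical illustration of the documentation added above (not part of the commit; it assumes two visible CUDA GPUs, and the memory figure and prompt are placeholders), restricting placement via `max_memory` and later resetting the device map before switching to model offloading could look like this:

```py
import torch
from diffusers import DiffusionPipeline

# Only GPU 0 appears in max_memory, so GPU 1 is ignored during placement.
pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
    device_map="balanced",
    max_memory={0: "6GB"},
)
print(pipeline.hf_device_map)  # components mapped to device 0, with CPU fallback if they don't fit

# reset_device_map() clears the mapping, after which to() and the
# offloading helpers are allowed again per the docs above.
pipeline.reset_device_map()
pipeline.enable_model_cpu_offload()
image = pipeline("a dog").images[0]
```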

src/diffusers/models/modeling_utils.py

Lines changed: 1 addition & 0 deletions
@@ -699,6 +699,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                        offload_folder=offload_folder,
                        offload_state_dict=offload_state_dict,
                        dtype=torch_dtype,
+                        force_hooks=True,
                    )
                except AttributeError as e:
                    # When using accelerate loading, we do not have the ability to load the state
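The one-line change above passes `force_hooks=True` to Accelerate so that device-placement hooks are attached even when a model ends up entirely on a single device, presumably so the pipeline-level dispatching added in this commit can rely on those hooks being present. A rough, standalone sketch of the same flag on Accelerate's `dispatch_model`, using a toy module (hypothetical, not from the commit, and assuming at least one CUDA GPU):

```py
import torch
from accelerate import dispatch_model

# Toy module; "0" and "1" are the Sequential's child-module names.
toy = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 4))

# Everything fits on cuda:0, but force_hooks=True still attaches Accelerate's
# device-alignment hooks, so the module behaves like a dispatched model.
toy = dispatch_model(toy, device_map={"0": 0, "1": 0}, force_hooks=True)
print(toy.hf_device_map)  # {'0': 0, '1': 0}
```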

src/diffusers/pipelines/pipeline_loading_utils.py

Lines changed: 229 additions & 5 deletions
@@ -22,15 +22,19 @@
 from typing import Any, Dict, List, Optional, Union
 
 import torch
-from huggingface_hub import (
-    model_info,
-)
+from huggingface_hub import model_info
+from huggingface_hub.utils import validate_hf_hub_args
 from packaging import version
 
+from .. import __version__
 from ..utils import (
+    FLAX_WEIGHTS_NAME,
+    ONNX_EXTERNAL_WEIGHTS_NAME,
+    ONNX_WEIGHTS_NAME,
     SAFETENSORS_WEIGHTS_NAME,
     WEIGHTS_NAME,
     get_class_from_dynamic_module,
+    is_accelerate_available,
     is_peft_available,
     is_transformers_available,
     logging,
@@ -44,9 +48,12 @@
     from transformers.utils import FLAX_WEIGHTS_NAME as TRANSFORMERS_FLAX_WEIGHTS_NAME
     from transformers.utils import SAFE_WEIGHTS_NAME as TRANSFORMERS_SAFE_WEIGHTS_NAME
     from transformers.utils import WEIGHTS_NAME as TRANSFORMERS_WEIGHTS_NAME
-from huggingface_hub.utils import validate_hf_hub_args
 
-from ..utils import FLAX_WEIGHTS_NAME, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME
+if is_accelerate_available():
+    import accelerate
+    from accelerate import dispatch_model
+    from accelerate.hooks import remove_hook_from_module
+    from accelerate.utils import compute_module_sizes, get_max_memory
 
 
 INDEX_FILE = "diffusion_pytorch_model.bin"
@@ -376,6 +383,207 @@ def _get_pipeline_class(
     return pipeline_cls
 
 
+def _load_empty_model(
+    library_name: str,
+    class_name: str,
+    importable_classes: List[Any],
+    pipelines: Any,
+    is_pipeline_module: bool,
+    name: str,
+    torch_dtype: Union[str, torch.dtype],
+    cached_folder: Union[str, os.PathLike],
+    **kwargs,
+):
+    # retrieve class objects.
+    class_obj, _ = get_class_obj_and_candidates(
+        library_name,
+        class_name,
+        importable_classes,
+        pipelines,
+        is_pipeline_module,
+        component_name=name,
+        cache_dir=cached_folder,
+    )
+
+    if is_transformers_available():
+        transformers_version = version.parse(version.parse(transformers.__version__).base_version)
+    else:
+        transformers_version = "N/A"
+
+    # Determine library.
+    is_transformers_model = (
+        is_transformers_available()
+        and issubclass(class_obj, PreTrainedModel)
+        and transformers_version >= version.parse("4.20.0")
+    )
+    diffusers_module = importlib.import_module(__name__.split(".")[0])
+    is_diffusers_model = issubclass(class_obj, diffusers_module.ModelMixin)
+
+    model = None
+    config_path = cached_folder
+    user_agent = {
+        "diffusers": __version__,
+        "file_type": "model",
+        "framework": "pytorch",
+    }
+
+    if is_diffusers_model:
+        # Load config and then the model on meta.
+        config, unused_kwargs, commit_hash = class_obj.load_config(
+            os.path.join(config_path, name),
+            cache_dir=cached_folder,
+            return_unused_kwargs=True,
+            return_commit_hash=True,
+            force_download=kwargs.pop("force_download", False),
+            resume_download=kwargs.pop("resume_download", False),
+            proxies=kwargs.pop("proxies", None),
+            local_files_only=kwargs.pop("local_files_only", False),
+            token=kwargs.pop("token", None),
+            revision=kwargs.pop("revision", None),
+            subfolder=kwargs.pop("subfolder", None),
+            user_agent=user_agent,
+        )
+        with accelerate.init_empty_weights():
+            model = class_obj.from_config(config, **unused_kwargs)
+    elif is_transformers_model:
+        config_class = getattr(class_obj, "config_class", None)
+        if config_class is None:
+            raise ValueError("`config_class` cannot be None. Please double-check the model.")
+
+        config = config_class.from_pretrained(
+            cached_folder,
+            subfolder=name,
+            force_download=kwargs.pop("force_download", False),
+            resume_download=kwargs.pop("resume_download", False),
+            proxies=kwargs.pop("proxies", None),
+            local_files_only=kwargs.pop("local_files_only", False),
+            token=kwargs.pop("token", None),
+            revision=kwargs.pop("revision", None),
+            user_agent=user_agent,
+        )
+        with accelerate.init_empty_weights():
+            model = class_obj(config)
+
+    if model is not None:
+        model = model.to(dtype=torch_dtype)
+    return model
+
+
+def _assign_components_to_devices(
+    module_sizes: Dict[str, float], device_memory: Dict[str, float], device_mapping_strategy: str = "balanced"
+):
+    device_ids = list(device_memory.keys())
+    device_cycle = device_ids + device_ids[::-1]
+    device_memory = device_memory.copy()
+
+    device_id_component_mapping = {}
+    current_device_index = 0
+    for component in module_sizes:
+        device_id = device_cycle[current_device_index % len(device_cycle)]
+        component_memory = module_sizes[component]
+        curr_device_memory = device_memory[device_id]
+
+        # If the GPU doesn't fit the current component offload to the CPU.
+        if component_memory > curr_device_memory:
+            device_id_component_mapping["cpu"] = [component]
+        else:
+            if device_id not in device_id_component_mapping:
+                device_id_component_mapping[device_id] = [component]
+            else:
+                device_id_component_mapping[device_id].append(component)
+
+            # Update the device memory.
+            device_memory[device_id] -= component_memory
+            current_device_index += 1
+
+    return device_id_component_mapping
+
+
+def _get_final_device_map(device_map, pipeline_class, passed_class_obj, init_dict, library, max_memory, **kwargs):
+    # To avoid circular import problem.
+    from diffusers import pipelines
+
+    torch_dtype = kwargs.get("torch_dtype", torch.float32)
+
+    # Load each module in the pipeline on a meta device so that we can derive the device map.
+    init_empty_modules = {}
+    for name, (library_name, class_name) in init_dict.items():
+        if class_name.startswith("Flax"):
+            raise ValueError("Flax pipelines are not supported with `device_map`.")
+
+        # Define all importable classes
+        is_pipeline_module = hasattr(pipelines, library_name)
+        importable_classes = ALL_IMPORTABLE_CLASSES
+        loaded_sub_model = None
+
+        # Use passed sub model or load class_name from library_name
+        if name in passed_class_obj:
+            # if the model is in a pipeline module, then we load it from the pipeline
+            # check that passed_class_obj has correct parent class
+            maybe_raise_or_warn(
+                library_name,
+                library,
+                class_name,
+                importable_classes,
+                passed_class_obj,
+                name,
+                is_pipeline_module,
+            )
+            with accelerate.init_empty_weights():
+                loaded_sub_model = passed_class_obj[name]
+
+        else:
+            loaded_sub_model = _load_empty_model(
+                library_name=library_name,
+                class_name=class_name,
+                importable_classes=importable_classes,
+                pipelines=pipelines,
+                is_pipeline_module=is_pipeline_module,
+                pipeline_class=pipeline_class,
+                name=name,
+                torch_dtype=torch_dtype,
+                cached_folder=kwargs.get("cached_folder", None),
+                force_download=kwargs.get("force_download", None),
+                resume_download=kwargs.get("resume_download", None),
+                proxies=kwargs.get("proxies", None),
+                local_files_only=kwargs.get("local_files_only", None),
+                token=kwargs.get("token", None),
+                revision=kwargs.get("revision", None),
+            )
+
+        if loaded_sub_model is not None:
+            init_empty_modules[name] = loaded_sub_model
+
+    # determine device map
+    # Obtain a sorted dictionary for mapping the model-level components
+    # to their sizes.
+    module_sizes = {
+        module_name: compute_module_sizes(module, dtype=torch_dtype)[""]
+        for module_name, module in init_empty_modules.items()
+        if isinstance(module, torch.nn.Module)
+    }
+    module_sizes = dict(sorted(module_sizes.items(), key=lambda item: item[1], reverse=True))
+
+    # Obtain maximum memory available per device (GPUs only).
+    max_memory = get_max_memory(max_memory)
+    max_memory = dict(sorted(max_memory.items(), key=lambda item: item[1], reverse=True))
+    max_memory = {k: v for k, v in max_memory.items() if k != "cpu"}
+
+    # Obtain a dictionary mapping the model-level components to the available
+    # devices based on the maximum memory and the model sizes.
+    device_id_component_mapping = _assign_components_to_devices(
+        module_sizes, max_memory, device_mapping_strategy=device_map
+    )
+
+    # Obtain the final device map, e.g., `{"unet": 0, "text_encoder": 1, "vae": 1, ...}`
+    final_device_map = {}
+    for device_id, components in device_id_component_mapping.items():
+        for component in components:
+            final_device_map[component] = device_id
+
+    return final_device_map
+
+
 def load_sub_model(
     library_name: str,
     class_name: str,
@@ -493,6 +701,22 @@ def load_sub_model(
         # else load from the root directory
         loaded_sub_model = load_method(cached_folder, **loading_kwargs)
 
+    if isinstance(loaded_sub_model, torch.nn.Module) and isinstance(device_map, dict):
+        # remove hooks
+        remove_hook_from_module(loaded_sub_model, recurse=True)
+        needs_offloading_to_cpu = device_map[""] == "cpu"
+
+        if needs_offloading_to_cpu:
+            dispatch_model(
+                loaded_sub_model,
+                state_dict=loaded_sub_model.state_dict(),
+                device_map=device_map,
+                force_hooks=True,
+                main_device=0,
+            )
+        else:
+            dispatch_model(loaded_sub_model, device_map=device_map, force_hooks=True)
+
     return loaded_sub_model
 
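To make the new `_assign_components_to_devices` / `_get_final_device_map` logic above easier to follow, here is a simplified, self-contained sketch of the "balanced" strategy with made-up component sizes. This is an illustration only, not the utility itself; the real code works on byte sizes from `compute_module_sizes` and per-device budgets from `get_max_memory`, and returns a device -> components mapping before it is flattened.

```py
# Components are visited largest-first and devices are cycled forward then
# backward (e.g. 1, 0, 0, 1, ...), so consecutive large components land on
# different GPUs. Sizes and budgets below are toy MB values.
module_sizes = {"unet": 1_700, "safety_checker": 600, "text_encoder": 250, "vae": 170}
device_memory = {1: 8_000, 0: 8_000}

device_cycle = list(device_memory) + list(device_memory)[::-1]
assignment, idx = {}, 0
for component, size in sorted(module_sizes.items(), key=lambda kv: kv[1], reverse=True):
    device_id = device_cycle[idx % len(device_cycle)]
    if size > device_memory[device_id]:
        assignment[component] = "cpu"          # doesn't fit: fall back to CPU
    else:
        assignment[component] = device_id
        device_memory[device_id] -= size       # reserve the memory just used
    idx += 1

print(assignment)  # {'unet': 1, 'safety_checker': 0, 'text_encoder': 0, 'vae': 1}
```

The resulting flat mapping has the same shape as the `hf_device_map` example shown in the documentation diff above, e.g. `{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}`.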