diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 55ce0cf79fb9..1e9e28471d89 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -814,14 +814,43 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            device_map (`Union[int, str, torch.device]` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                 A map that specifies where each submodule should go. It doesn't need to be defined for each
                 parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                 same device. Defaults to `None`, meaning that the model will be loaded on CPU.
+                Examples:
+
+                ```py
+                >>> from diffusers import AutoModel
+                >>> import torch
+
+                >>> # This works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
+                ... )
+                >>> # This also works (integer accelerator device ID).
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map=0
+                ... )
+                >>> # Specifying a supported offloading strategy like "auto" also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="auto"
+                ... )
+                >>> # Specifying a dictionary as `device_map` also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0",
+                ...     subfolder="unet",
+                ...     device_map={"": torch.device("cuda")},
+                ... )
+                ```
+
                 Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
                 more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+                map](https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap). You
+                can also refer to the [Diffusers-specific
+                documentation](https://huggingface.co/docs/diffusers/main/en/training/distributed_inference#model-sharding)
+                for more concrete examples.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for each
                 GPU and the available CPU RAM if unset.
@@ -1387,7 +1416,7 @@ def _load_pretrained_model(
         low_cpu_mem_usage: bool = True,
         dtype: Optional[Union[str, torch.dtype]] = None,
         keep_in_fp32_modules: Optional[List[str]] = None,
-        device_map: Dict[str, Union[int, str, torch.device]] = None,
+        device_map: Union[str, int, torch.device, Dict[str, Union[int, str, torch.device]]] = None,
         offload_state_dict: Optional[bool] = None,
         offload_folder: Optional[Union[str, os.PathLike]] = None,
         dduf_entries: Optional[Dict[str, DDUFEntry]] = None,
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 0ac4251ec6d3..efeb085a723b 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -669,14 +669,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
-                A map that specifies where each submodule should go. It doesn’t need to be defined for each
-                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
-                same device.
-
-                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
-                more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            device_map (`str`, *optional*):
+                Strategy that dictates how the different components of a pipeline should be placed on available
+                devices. Currently, only "balanced" `device_map` is supported. Check out
+                [this](https://huggingface.co/docs/diffusers/main/en/tutorials/inference_with_big_models#device-placement)
+                to know more.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for each
                 GPU and the available CPU RAM if unset.
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index c8ed68c65b40..e0331d15dd04 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -46,6 +46,7 @@
     require_peft_backend,
     require_torch_accelerator,
    require_torch_accelerator_with_fp16,
+    require_torch_gpu,
     skip_mps,
     slow,
     torch_all_close,
@@ -1083,6 +1084,42 @@ def test_load_sharded_checkpoint_device_map_from_hub_local_subfolder(self):
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)
 
+    @parameterized.expand(
+        [
+            (-1, "You can't pass device_map as a negative int"),
+            ("foo", "When passing device_map as a string, the value needs to be a device name"),
+        ]
+    )
+    def test_wrong_device_map_raises_error(self, device_map, msg_substring):
+        with self.assertRaises(ValueError) as err_ctx:
+            _ = self.model_class.from_pretrained(
+                "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map=device_map
+            )
+
+        assert msg_substring in str(err_ctx.exception)
+
+    @parameterized.expand([0, "cuda", torch.device("cuda"), torch.device("cuda:0")])
+    @require_torch_gpu
+    def test_passing_non_dict_device_map_works(self, device_map):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map=device_map
+        )
+        output = loaded_model(**inputs_dict)
+        assert output.sample.shape == (4, 4, 16, 16)
+
+    @parameterized.expand([("", "cuda"), ("", torch.device("cuda"))])
+    @require_torch_gpu
+    def test_passing_dict_device_map_works(self, name, device_map):
+        # There are other valid dict-based `device_map` values too. It's best to refer to
+        # the docs for those: https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap.
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map={name: device_map}
+        )
+        output = loaded_model(**inputs_dict)
+        assert output.sample.shape == (4, 4, 16, 16)
+
     @require_peft_backend
     def test_load_attn_procs_raise_warning(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
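For reference, a minimal usage sketch of the non-dict `device_map` values this change enables. It assumes a CUDA device is available; the checkpoint and `AutoModel` usage mirror the docstring example above, and the error message is the one asserted in `test_wrong_device_map_raises_error`.

```py
import torch
from diffusers import AutoModel

repo_id = "stabilityai/stable-diffusion-xl-base-1.0"

# Device strings, integer device IDs, torch.device objects, and {"": device} dicts are all accepted.
unet = AutoModel.from_pretrained(repo_id, subfolder="unet", device_map="cuda")
unet = AutoModel.from_pretrained(repo_id, subfolder="unet", device_map=0)
unet = AutoModel.from_pretrained(repo_id, subfolder="unet", device_map={"": torch.device("cuda")})

# Invalid values are rejected with a ValueError.
try:
    AutoModel.from_pretrained(repo_id, subfolder="unet", device_map=-1)
except ValueError as err:
    print(err)  # "You can't pass device_map as a negative int"
```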