From 30d924c6412bae74bb5767de04d25500a3166891 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 27 Aug 2025 15:22:14 -0700 Subject: [PATCH 1/4] init --- docs/source/en/_toctree.yml | 2 + docs/source/en/using-diffusers/loading.md | 39 ++----- docs/source/en/using-diffusers/models.md | 123 ++++++++++++++++++++++ 3 files changed, 131 insertions(+), 33 deletions(-) create mode 100644 docs/source/en/using-diffusers/models.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b33989aed0e1..14dbfe3ea1d3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -24,6 +24,8 @@ title: Reproducibility - local: using-diffusers/schedulers title: Load schedulers and models + - local: using-diffusers/models + title: Models - local: using-diffusers/scheduler_features title: Scheduler features - local: using-diffusers/other-formats diff --git a/docs/source/en/using-diffusers/loading.md b/docs/source/en/using-diffusers/loading.md index f86ea104cf69..f1a997889061 100644 --- a/docs/source/en/using-diffusers/loading.md +++ b/docs/source/en/using-diffusers/loading.md @@ -108,23 +108,20 @@ print(pipeline.transformer.dtype, pipeline.vae.dtype) The `device_map` argument determines individual model or pipeline placement on an accelerator like a GPU. It is especially helpful when there are multiple GPUs. -Diffusers currently provides three options to `device_map`, `"cuda"`, `"balanced"` and `"auto"`. Refer to the table below to compare the three placement strategies. +A pipeline supports two options for `device_map`, `"cuda"` and `"balanced"`. Refer to the table below to compare the placement strategies. | parameter | description | |---|---| -| `"cuda"` | places model or pipeline on CUDA device | -| `"balanced"` | evenly distributes model or pipeline on all GPUs | -| `"auto"` | distribute model from fastest device first to slowest | +| `"cuda"` | places pipeline on CUDA device | +| `"balanced"` | evenly distributes pipeline on all GPUs | Use the `max_memory` argument in [`~DiffusionPipeline.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available. - - - ```py import torch from diffusers import DiffusionPipeline +max_memory = {0: "16GB", 1: "16GB"} pipeline = DiffusionPipeline.from_pretrained( "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, @@ -132,26 +129,6 @@ pipeline = DiffusionPipeline.from_pretrained( ) ``` - - - -```py -import torch -from diffusers import AutoModel - -max_memory = {0: "16GB", 1: "16GB"} -transformer = AutoModel.from_pretrained( - "Qwen/Qwen-Image", - subfolder="transformer", - torch_dtype=torch.bfloat16 - device_map="cuda", - max_memory=max_memory -) -``` - - - - The `hf_device_map` attribute allows you to access and view the `device_map`. ```py @@ -189,22 +166,18 @@ pipeline = DiffusionPipeline.from_pretrained( [`DiffusionPipeline`] is flexible and accommodates loading different models or schedulers. You can experiment with different schedulers to optimize for generation speed or quality, and you can replace models with more performant ones. -The example below swaps the default scheduler to generate higher quality images and a more stable VAE version. Pass the `subfolder` argument in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler to the correct subfolder. +The example below uses a more stable VAE version. 
```py
import torch
-from diffusers import DiffusionPipeline, HeunDiscreteScheduler, AutoModel
+from diffusers import DiffusionPipeline, AutoModel

-scheduler = HeunDiscreteScheduler.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"
-)
vae = AutoModel.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
)
pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
-    scheduler=scheduler,
    vae=vae,
    torch_dtype=torch.float16,
    device_map="cuda"
diff --git a/docs/source/en/using-diffusers/models.md b/docs/source/en/using-diffusers/models.md
new file mode 100644
index 000000000000..4207e92e6209
--- /dev/null
+++ b/docs/source/en/using-diffusers/models.md
@@ -0,0 +1,123 @@
+

[[open-in-colab]]

## Models

A diffusion model relies on a few individual models working together to generate an output. These models are responsible for denoising, encoding inputs, and decoding latents into the actual outputs.

This guide will show you how to load models.

## Loading a model

All models are loaded with the [`~ModelMixin.from_pretrained`] method, which downloads and caches the latest model version. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses them instead of downloading them again.

Pass the `subfolder` argument to [`~ModelMixin.from_pretrained`] to specify where to load the model weights from. Omit the `subfolder` argument if the repository doesn't have a subfolder structure or if you're loading a standalone model.

```py
from diffusers import QwenImageTransformer2DModel

model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer")
```

## AutoModel

[`AutoModel`] detects the model class from a `model_index.json` file or a model's `config.json` file. It fetches the correct model class from these files and delegates the actual loading to the model class. [`AutoModel`] is useful for automatic model type detection without needing to know the exact model class beforehand.

```py
from diffusers import AutoModel

model = AutoModel.from_pretrained(
    "Qwen/Qwen-Image",
    subfolder="transformer"
)
```

## Model data types

Use the `torch_dtype` argument in [`~ModelMixin.from_pretrained`] to load a model with a specific data type. This allows you to load a model in a lower precision to reduce memory usage.

```py
import torch
from diffusers import QwenImageTransformer2DModel

model = QwenImageTransformer2DModel.from_pretrained(
    "Qwen/Qwen-Image",
    subfolder="transformer"
    torch_dtype=torch.float16
)
```

[torch.Tensor.to](https://docs.pytorch.org/docs/stable/generated/torch.Tensor.to.html) can also convert to a specific data type on the fly. However, it converts *all* weights to the requested data type unlike `torch_dtype` which respects `_keep_in_fp32_modules`. This argument preserves layers in `torch.float32` for numerical stability and best generation quality (see example [_keep_in_fp32_modules](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374))

```py
import torch
from diffusers import QwenImageTransformer2DModel

model = QwenImageTransformer2DModel.from_pretrained(
    "Qwen/Qwen-Image",,
    subfolder="transformer"
)
model = model.to(dtype=torch.float16)
```

## Device placement

Use the `device_map` argument in [`~ModelMixin.from_pretrained`] to place a model on an accelerator like a GPU.
It is especially helpful when there are multiple GPUs.

Diffusers currently provides three options to `device_map` for individual models, `"cuda"`, `"balanced"` and `"auto"`. Refer to the table below to compare the three placement strategies.

| parameter | description |
|---|---|
| `"cuda"` | places pipeline on CUDA device |
| `"balanced"` | evenly distributes pipeline on all GPUs |
| `"auto"` | distribute model from fastest device first to slowest |

Use the `max_memory` argument in [`~ModelMixin.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.

```py
import torch
from diffusers import QwenImageTransformer2DModel

max_memory = {0: "16GB", 1: "16GB"}
transformer = QwenImageTransformer2DModel.from_pretrained(
    "Qwen/Qwen-Image",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    max_memory=max_memory
)
```

The `hf_device_map` attribute allows you to access and view the `device_map`.

```py
print(transformer.hf_device_map)
# {'': device(type='cuda')}
```

## Saving models

Save a model with the [`~ModelMixin.save_pretrained`] method.

```py
from diffusers import QwenImageTransformer2DModel

model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer")
model.save_pretrained("./local/model")
```

For large models, it is helpful to use `max_shard_size` to save a model as multiple shards. Shards can be loaded faster and save memory (refer to the [parallel loading](./loading#parallel-loading) docs for more details), especially if there is more than one GPU.

```py
model.save_pretrained("./local/model", max_shard_size="5GB")
```
From 8816f528823a53f753d5c925e4df5eba85a8c831 Mon Sep 17 00:00:00 2001
From: stevhliu
Date: Wed, 27 Aug 2025 15:49:12 -0700
Subject: [PATCH 2/4] fix

---
 docs/source/en/using-diffusers/models.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/using-diffusers/models.md b/docs/source/en/using-diffusers/models.md
index 4207e92e6209..221742103cea 100644
--- a/docs/source/en/using-diffusers/models.md
+++ b/docs/source/en/using-diffusers/models.md
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

[[open-in-colab]]

-## Models
+# Models

A diffusion model relies on a few individual models working together to generate an output. These models are responsible for denoising, encoding inputs, and decoding latents into the actual outputs.
From 89fc965addd0948f64bf9ea54df72e159a23708e Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 2 Sep 2025 16:40:48 -0700 Subject: [PATCH 3/4] feedback --- docs/source/en/using-diffusers/models.md | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/docs/source/en/using-diffusers/models.md b/docs/source/en/using-diffusers/models.md index 221742103cea..22c78d490ae4 100644 --- a/docs/source/en/using-diffusers/models.md +++ b/docs/source/en/using-diffusers/models.md @@ -38,8 +38,7 @@ model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder from diffusers import AutoModel model = AutoModel.from_pretrained( - "Qwen/Qwen-Image", - subfolder="transformer" + "Qwen/Qwen-Image", subfolder="transformer" ) ``` @@ -53,19 +52,18 @@ from diffusers import QwenImageTransformer2DModel model = QwenImageTransformer2DModel.from_pretrained( "Qwen/Qwen-Image", - subfolder="transformer" - torch_dtype=torch.float16 + subfolder="transformer", + torch_dtype=torch.bfloat16 ) ``` -[torch.Tensor.to](https://docs.pytorch.org/docs/stable/generated/torch.Tensor.to.html) can also convert to a specific data type on the fly. However, it converts *all* weights to the requested data type unlike `torch_dtype` which respects `_keep_in_fp32_modules`. This argument preserves layers in `torch.float32` for numerical stability and best generation quality (see example [_keep_in_fp32_modules](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374)) +[nn.Module.to](https://docs.pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to) can also convert to a specific data type on the fly. However, it converts *all* weights to the requested data type unlike `torch_dtype` which respects `_keep_in_fp32_modules`. 
This argument preserves layers in `torch.float32` for numerical stability and best generation quality (see example [_keep_in_fp32_modules](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374)) ```py from diffusers import QwenImageTransformer2DModel model = QwenImageTransformer2DModel.from_pretrained( - "Qwen/Qwen-Image",, - subfolder="transformer" + "Qwen/Qwen-Image", subfolder="transformer" ) model = model.to(dtype=torch.float16) ``` @@ -78,7 +76,7 @@ Diffusers currently provides three options to `device_map` for individual models | parameter | description | |---|---| -| `"cuda"` | places pipeline on CUDA device | +| `"cuda"` | places pipeline on a supported accelerator (CUDA) | | `"balanced"` | evenly distributes pipeline on all GPUs | | `"auto"` | distribute model from fastest device first to slowest | @@ -86,12 +84,11 @@ Use the `max_memory` argument in [`~ModelMixin.from_pretrained`] to allocate a m ```py import torch -from diffusers import QwenImageTransformer2DModel +from diffusers import QwenImagePipeline max_memory = {0: "16GB", 1: "16GB"} -transformer = QwenImageTransformer2DModel.from_pretrained( +pipeline = QwenImagePipeline.from_pretrained( "Qwen/Qwen-Image", - subfolder="transformer", torch_dtype=torch.bfloat16, device_map="cuda", max_memory=max_memory From c777df842aa262e783ef0df7805e973991b81797 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Fri, 5 Sep 2025 09:41:37 -0700 Subject: [PATCH 4/4] feedback --- docs/source/en/using-diffusers/loading.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/loading.md b/docs/source/en/using-diffusers/loading.md index f1a997889061..25b53d2f4d49 100644 --- a/docs/source/en/using-diffusers/loading.md +++ b/docs/source/en/using-diffusers/loading.md @@ -112,7 +112,7 @@ A pipeline supports two options for `device_map`, `"cuda"` and `"balanced"`. Ref | parameter | description | |---|---| -| `"cuda"` | places pipeline on CUDA device | +| `"cuda"` | places pipeline on a supported accelerator device like CUDA | | `"balanced"` | evenly distributes pipeline on all GPUs | Use the `max_memory` argument in [`~DiffusionPipeline.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.