From 30d924c6412bae74bb5767de04d25500a3166891 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 27 Aug 2025 15:22:14 -0700 Subject: [PATCH 1/4] init --- docs/source/en/_toctree.yml | 2 + docs/source/en/using-diffusers/loading.md | 39 ++----- docs/source/en/using-diffusers/models.md | 123 ++++++++++++++++++++++ 3 files changed, 131 insertions(+), 33 deletions(-) create mode 100644 docs/source/en/using-diffusers/models.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b33989aed0e1..14dbfe3ea1d3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -24,6 +24,8 @@ title: Reproducibility - local: using-diffusers/schedulers title: Load schedulers and models + - local: using-diffusers/models + title: Models - local: using-diffusers/scheduler_features title: Scheduler features - local: using-diffusers/other-formats diff --git a/docs/source/en/using-diffusers/loading.md b/docs/source/en/using-diffusers/loading.md index f86ea104cf69..f1a997889061 100644 --- a/docs/source/en/using-diffusers/loading.md +++ b/docs/source/en/using-diffusers/loading.md @@ -108,23 +108,20 @@ print(pipeline.transformer.dtype, pipeline.vae.dtype) The `device_map` argument determines individual model or pipeline placement on an accelerator like a GPU. It is especially helpful when there are multiple GPUs. -Diffusers currently provides three options to `device_map`, `"cuda"`, `"balanced"` and `"auto"`. Refer to the table below to compare the three placement strategies. +A pipeline supports two options for `device_map`, `"cuda"` and `"balanced"`. Refer to the table below to compare the placement strategies. | parameter | description | |---|---| -| `"cuda"` | places model or pipeline on CUDA device | -| `"balanced"` | evenly distributes model or pipeline on all GPUs | -| `"auto"` | distribute model from fastest device first to slowest | +| `"cuda"` | places pipeline on CUDA device | +| `"balanced"` | evenly distributes pipeline on all GPUs | Use the `max_memory` argument in [`~DiffusionPipeline.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available. - - - ```py import torch from diffusers import DiffusionPipeline +max_memory = {0: "16GB", 1: "16GB"} pipeline = DiffusionPipeline.from_pretrained( "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, @@ -132,26 +129,6 @@ pipeline = DiffusionPipeline.from_pretrained( ) ``` - - - -```py -import torch -from diffusers import AutoModel - -max_memory = {0: "16GB", 1: "16GB"} -transformer = AutoModel.from_pretrained( - "Qwen/Qwen-Image", - subfolder="transformer", - torch_dtype=torch.bfloat16 - device_map="cuda", - max_memory=max_memory -) -``` - - - - The `hf_device_map` attribute allows you to access and view the `device_map`. ```py @@ -189,22 +166,18 @@ pipeline = DiffusionPipeline.from_pretrained( [`DiffusionPipeline`] is flexible and accommodates loading different models or schedulers. You can experiment with different schedulers to optimize for generation speed or quality, and you can replace models with more performant ones. -The example below swaps the default scheduler to generate higher quality images and a more stable VAE version. Pass the `subfolder` argument in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler to the correct subfolder. +The example below uses a more stable VAE version. 
```py
import torch
-from diffusers import DiffusionPipeline, HeunDiscreteScheduler, AutoModel
+from diffusers import DiffusionPipeline, AutoModel

-scheduler = HeunDiscreteScheduler.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"
-)
vae = AutoModel.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
)
pipeline = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
-    scheduler=scheduler,
    vae=vae,
    torch_dtype=torch.float16,
    device_map="cuda"
diff --git a/docs/source/en/using-diffusers/models.md b/docs/source/en/using-diffusers/models.md
new file mode 100644
index 000000000000..4207e92e6209
--- /dev/null
+++ b/docs/source/en/using-diffusers/models.md
@@ -0,0 +1,123 @@
+

[[open-in-colab]]

## Models

A diffusion model relies on a few individual models working together to generate an output. These models are responsible for denoising, encoding inputs, and decoding latents into the actual outputs.

This guide will show you how to load models.

## Loading a model

All models are loaded with the [`~ModelMixin.from_pretrained`] method, which downloads and caches the latest model version. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses them instead of downloading them again.

Pass the `subfolder` argument to [`~ModelMixin.from_pretrained`] to specify where to load the model weights from. Omit the `subfolder` argument if the repository doesn't have a subfolder structure or if you're loading a standalone model.

```py
from diffusers import QwenImageTransformer2DModel

model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer")
```

## AutoModel

[`AutoModel`] detects the model class from a `model_index.json` file or a model's `config.json` file. It fetches the correct model class from these files and delegates the actual loading to the model class. [`AutoModel`] is useful for automatic model type detection without needing to know the exact model class beforehand.

```py
from diffusers import AutoModel

model = AutoModel.from_pretrained(
    "Qwen/Qwen-Image",
    subfolder="transformer"
)
```

## Model data types

Use the `torch_dtype` argument in [`~ModelMixin.from_pretrained`] to load a model with a specific data type. This allows you to load a model in a lower precision to reduce memory usage.

```py
import torch
from diffusers import QwenImageTransformer2DModel

model = QwenImageTransformer2DModel.from_pretrained(
    "Qwen/Qwen-Image",
    subfolder="transformer"
    torch_dtype=torch.float16
)
```

[torch.Tensor.to](https://docs.pytorch.org/docs/stable/generated/torch.Tensor.to.html) can also convert to a specific data type on the fly. However, it converts *all* weights to the requested data type unlike `torch_dtype` which respects `_keep_in_fp32_modules`. This argument preserves layers in `torch.float32` for numerical stability and best generation quality (see example [_keep_in_fp32_modules](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374))

```py
import torch
from diffusers import QwenImageTransformer2DModel

model = QwenImageTransformer2DModel.from_pretrained(
    "Qwen/Qwen-Image",,
    subfolder="transformer"
)
model = model.to(dtype=torch.float16)
```

## Device placement

Use the `device_map` argument in [`~ModelMixin.from_pretrained`] to place a model on an accelerator like a GPU.
It is especially helpful when there are multiple GPUs.

Diffusers currently provides three options to `device_map` for individual models, `"cuda"`, `"balanced"` and `"auto"`. Refer to the table below to compare the three placement strategies.

| parameter | description |
|---|---|
| `"cuda"` | places pipeline on CUDA device |
| `"balanced"` | evenly distributes pipeline on all GPUs |
| `"auto"` | distribute model from fastest device first to slowest |

Use the `max_memory` argument in [`~ModelMixin.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.

```py
import torch
from diffusers import QwenImageTransformer2DModel

max_memory = {0: "16GB", 1: "16GB"}
transformer = QwenImageTransformer2DModel.from_pretrained(
    "Qwen/Qwen-Image",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    max_memory=max_memory
)
```

The `hf_device_map` attribute allows you to access and view the `device_map`.

```py
print(transformer.hf_device_map)
# {'': device(type='cuda')}
```

## Saving models

Save a model with the [`~ModelMixin.save_pretrained`] method.

```py
from diffusers import QwenImageTransformer2DModel

model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer")
model.save_pretrained("./local/model")
```

For large models, it is helpful to use `max_shard_size` to save a model as multiple shards. Shards can be loaded faster and save memory (refer to the [parallel loading](./loading#parallel-loading) docs for more details), especially if there is more than one GPU.

```py
model.save_pretrained("./local/model", max_shard_size="5GB")
```
From 8816f528823a53f753d5c925e4df5eba85a8c831 Mon Sep 17 00:00:00 2001
From: stevhliu
Date: Wed, 27 Aug 2025 15:49:12 -0700
Subject: [PATCH 2/4] fix

---
 docs/source/en/using-diffusers/models.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/using-diffusers/models.md b/docs/source/en/using-diffusers/models.md
index 4207e92e6209..221742103cea 100644
--- a/docs/source/en/using-diffusers/models.md
+++ b/docs/source/en/using-diffusers/models.md
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

[[open-in-colab]]

-## Models
+# Models

A diffusion model relies on a few individual models working together to generate an output. These models are responsible for denoising, encoding inputs, and decoding latents into the actual outputs.
From 89fc965addd0948f64bf9ea54df72e159a23708e Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 2 Sep 2025 16:40:48 -0700 Subject: [PATCH 3/4] feedback --- docs/source/en/using-diffusers/models.md | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/docs/source/en/using-diffusers/models.md b/docs/source/en/using-diffusers/models.md index 221742103cea..22c78d490ae4 100644 --- a/docs/source/en/using-diffusers/models.md +++ b/docs/source/en/using-diffusers/models.md @@ -38,8 +38,7 @@ model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder from diffusers import AutoModel model = AutoModel.from_pretrained( - "Qwen/Qwen-Image", - subfolder="transformer" + "Qwen/Qwen-Image", subfolder="transformer" ) ``` @@ -53,19 +52,18 @@ from diffusers import QwenImageTransformer2DModel model = QwenImageTransformer2DModel.from_pretrained( "Qwen/Qwen-Image", - subfolder="transformer" - torch_dtype=torch.float16 + subfolder="transformer", + torch_dtype=torch.bfloat16 ) ``` -[torch.Tensor.to](https://docs.pytorch.org/docs/stable/generated/torch.Tensor.to.html) can also convert to a specific data type on the fly. However, it converts *all* weights to the requested data type unlike `torch_dtype` which respects `_keep_in_fp32_modules`. This argument preserves layers in `torch.float32` for numerical stability and best generation quality (see example [_keep_in_fp32_modules](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374)) +[nn.Module.to](https://docs.pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to) can also convert to a specific data type on the fly. However, it converts *all* weights to the requested data type unlike `torch_dtype` which respects `_keep_in_fp32_modules`. 
This argument preserves layers in `torch.float32` for numerical stability and best generation quality (see example [_keep_in_fp32_modules](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374)) ```py from diffusers import QwenImageTransformer2DModel model = QwenImageTransformer2DModel.from_pretrained( - "Qwen/Qwen-Image",, - subfolder="transformer" + "Qwen/Qwen-Image", subfolder="transformer" ) model = model.to(dtype=torch.float16) ``` @@ -78,7 +76,7 @@ Diffusers currently provides three options to `device_map` for individual models | parameter | description | |---|---| -| `"cuda"` | places pipeline on CUDA device | +| `"cuda"` | places pipeline on a supported accelerator (CUDA) | | `"balanced"` | evenly distributes pipeline on all GPUs | | `"auto"` | distribute model from fastest device first to slowest | @@ -86,12 +84,11 @@ Use the `max_memory` argument in [`~ModelMixin.from_pretrained`] to allocate a m ```py import torch -from diffusers import QwenImageTransformer2DModel +from diffusers import QwenImagePipeline max_memory = {0: "16GB", 1: "16GB"} -transformer = QwenImageTransformer2DModel.from_pretrained( +pipeline = QwenImagePipeline.from_pretrained( "Qwen/Qwen-Image", - subfolder="transformer", torch_dtype=torch.bfloat16, device_map="cuda", max_memory=max_memory From c777df842aa262e783ef0df7805e973991b81797 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Fri, 5 Sep 2025 09:41:37 -0700 Subject: [PATCH 4/4] feedback --- docs/source/en/using-diffusers/loading.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/loading.md b/docs/source/en/using-diffusers/loading.md index f1a997889061..25b53d2f4d49 100644 --- a/docs/source/en/using-diffusers/loading.md +++ b/docs/source/en/using-diffusers/loading.md @@ -112,7 +112,7 @@ A pipeline supports two options for `device_map`, `"cuda"` and `"balanced"`. Ref | parameter | description | |---|---| -| `"cuda"` | places pipeline on CUDA device | +| `"cuda"` | places pipeline on a supported accelerator device like CUDA | | `"balanced"` | evenly distributes pipeline on all GPUs | Use the `max_memory` argument in [`~DiffusionPipeline.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.