feedback

stevhliu · stevhliu · commit 43273e287f10 · 2025-08-14T10:20:33.000-07:00
diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md
@@ -53,9 +53,8 @@ import torch
 from diffusers import DiffusionPipeline
 
 pipeline = DiffusionPipeline.from_pretrained(
-  "Qwen/Qwen-Image",
-  torch_dtype=torch.bfloat16
-).to("cuda")
+  "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
+)
 
 prompt = """
 cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
@@ -84,7 +83,8 @@ pipeline = DiffusionPipeline.from_pretrained(
   "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
   vae=vae
   torch_dtype=torch.bfloat16,
-).to("cuda")
+  device_map="cuda"
+)
 
 prompt = """
 Cinematic video of a sleek cat lounging on a colorful inflatable in a crystal-clear turquoise pool in Palm Springs, 
@@ -110,13 +110,11 @@ import torch
 from diffusers import DiffusionPipeline
 
 pipeline = DiffusionPipeline.from_pretrained(
-  "Qwen/Qwen-Image",
-  torch_dtype=torch.bfloat16
+  "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
 )
 pipeline.load_lora_weights(
   "flymy-ai/qwen-image-realism-lora",
 )
-pipeline.to("cuda")
 
 prompt = """
 super Realism cinematic film still of a cat sipping a margarita in a pool in Palm Springs in the style of umempart, California
@@ -149,7 +147,8 @@ pipeline = DiffusionPipeline.from_pretrained(
   "Qwen/Qwen-Image",
   torch_dtype=torch.bfloat16,
   quantization_config=quant_config,
-).to("cuda")
+  device_map="cuda"
+)
 
 prompt = """
 cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
@@ -187,7 +186,8 @@ pipeline = DiffusionPipeline.from_pretrained(
   "Qwen/Qwen-Image",
   torch_dtype=torch.bfloat16,
   quantization_config=quant_config,
-).to("cuda")
+  device_map="cuda"
+)
 pipeline.enable_model_cpu_offload()
 
 prompt = """
@@ -213,9 +213,8 @@ import torch
 from diffusers import DiffusionPipeline
 
 pipeline = DiffusionPipeline.from_pretrained(
-  "Qwen/Qwen-Image",
-  torch_dtype=torch.bfloat16
-).to("cuda")
+  "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
+)
 
 pipeline.transformer.compile_repeated_blocks(
     fullgraph=True,
diff --git a/docs/source/en/stable_diffusion.md b/docs/source/en/stable_diffusion.md
@@ -22,14 +22,17 @@ This guide recommends some basic performance tips for using the [`DiffusionPipel
 
 Reducing the amount of memory used indirectly speeds up generation and can help a model fit on device.
 
+The [`~DiffusionPipeline.enable_model_cpu_offload`] method moves a model to the CPU when it is not in use to save GPU memory.
+
 ```py
 import torch
 from diffusers import DiffusionPipeline
 
 pipeline = DiffusionPipeline.from_pretrained(
   "stabilityai/stable-diffusion-xl-base-1.0",
-  torch_dtype=torch.bfloat16
-).to("cuda")
+  torch_dtype=torch.bfloat16,
+  device_map="cuda"
+)
 pipeline.enable_model_cpu_offload()
 
 prompt = """
@@ -44,7 +47,7 @@ print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} G
 
 Denoising is the most computationally demanding process during diffusion. Methods that optimizes this process accelerates inference speed. Try the following methods for a speed up.
 
-- Add `.to("cuda")` to place the pipeline on a GPU. Placing a model on an accelerator, like a GPU, increases speed because it performs computations in parallel.
+- Add `device_map="cuda"` to place the pipeline on a GPU. Placing a model on an accelerator, like a GPU, increases speed because it performs computations in parallel.
 - Set `torch_dtype=torch.bfloat16` to execute the pipeline in half-precision. Reducing the data type precision increases speed because it takes less time to perform computations in a lower precision.
 
 ```py
@@ -54,8 +57,9 @@ from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
 
 pipeline = DiffusionPipeline.from_pretrained(
   "stabilityai/stable-diffusion-xl-base-1.0",
-  torch_dtype=torch.bfloat16
-).to("cuda")
+  torch_dtype=torch.bfloat16,
+  device_map="cuda"
+)
 ```
 
 - Use a faster scheduler, such as [`DPMSolverMultistepScheduler`], which only requires ~20-25 steps.
@@ -88,8 +92,9 @@ Many modern diffusion models deliver high-quality images out-of-the-box. However
 
     pipeline = DiffusionPipeline.from_pretrained(
         "stabilityai/stable-diffusion-xl-base-1.0",
-        torch_dtype=torch.bfloat16
-    ).to("cuda")
+        torch_dtype=torch.bfloat16,
+        device_map="cuda"
+    )
 
     prompt = """
     cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
@@ -109,8 +114,9 @@ Many modern diffusion models deliver high-quality images out-of-the-box. However
 
     pipeline = DiffusionPipeline.from_pretrained(
         "stabilityai/stable-diffusion-xl-base-1.0",
-        torch_dtype=torch.bfloat16
-    ).to("cuda")
+        torch_dtype=torch.bfloat16,
+        device_map="cuda"
+    )
     pipeline.scheduler = HeunDiscreteScheduler.from_config(pipeline.scheduler.config)
 
     prompt = """