
Commit eae7b9e: Merge branch 'main' into unet3d

2 parents: 23b128e + 8ead643


47 files changed: +818, -134 lines

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions

@@ -81,6 +81,8 @@
     title: Overview
   - local: hybrid_inference/vae_decode
     title: VAE Decode
+  - local: hybrid_inference/vae_encode
+    title: VAE Encode
   - local: hybrid_inference/api_reference
     title: API Reference
   title: Hybrid Inference

docs/source/en/api/pipelines/lumina.md

Lines changed: 7 additions & 7 deletions

@@ -58,10 +58,10 @@ Use [`torch.compile`](https://huggingface.co/docs/diffusers/main/en/tutorials/fa
 First, load the pipeline:
 
 ```python
-from diffusers import LuminaText2ImgPipeline
+from diffusers import LuminaPipeline
 import torch
 
-pipeline = LuminaText2ImgPipeline.from_pretrained(
+pipeline = LuminaPipeline.from_pretrained(
     "Alpha-VLLM/Lumina-Next-SFT-diffusers", torch_dtype=torch.bfloat16
 ).to("cuda")
 ```
@@ -86,11 +86,11 @@ image = pipeline(prompt="Upper body of a young woman in a Victorian-era outfit w
 
 Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.
 
-Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`LuminaText2ImgPipeline`] for inference with bitsandbytes.
+Refer to the [Quantization](../../quantization/overview) overview to learn more about supported quantization backends and selecting a quantization backend that supports your use case. The example below demonstrates how to load a quantized [`LuminaPipeline`] for inference with bitsandbytes.
 
 ```py
 import torch
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, Transformer2DModel, LuminaText2ImgPipeline
+from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, Transformer2DModel, LuminaPipeline
 from transformers import BitsAndBytesConfig as BitsAndBytesConfig, T5EncoderModel
 
 quant_config = BitsAndBytesConfig(load_in_8bit=True)
@@ -109,7 +109,7 @@ transformer_8bit = Transformer2DModel.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-pipeline = LuminaText2ImgPipeline.from_pretrained(
+pipeline = LuminaPipeline.from_pretrained(
     "Alpha-VLLM/Lumina-Next-SFT-diffusers",
     text_encoder=text_encoder_8bit,
     transformer=transformer_8bit,
@@ -122,9 +122,9 @@ image = pipeline(prompt).images[0]
 image.save("lumina.png")
 ```
 
-## LuminaText2ImgPipeline
+## LuminaPipeline
 
-[[autodoc]] LuminaText2ImgPipeline
+[[autodoc]] LuminaPipeline
 - all
 - __call__
docs/source/en/api/pipelines/lumina2.md

Lines changed: 6 additions & 6 deletions

@@ -36,14 +36,14 @@ Single file loading for Lumina Image 2.0 is available for the `Lumina2Transforme
 
 ```python
 import torch
-from diffusers import Lumina2Transformer2DModel, Lumina2Text2ImgPipeline
+from diffusers import Lumina2Transformer2DModel, Lumina2Pipeline
 
 ckpt_path = "https://huggingface.co/Alpha-VLLM/Lumina-Image-2.0/blob/main/consolidated.00-of-01.pth"
 transformer = Lumina2Transformer2DModel.from_single_file(
     ckpt_path, torch_dtype=torch.bfloat16
 )
 
-pipe = Lumina2Text2ImgPipeline.from_pretrained(
+pipe = Lumina2Pipeline.from_pretrained(
     "Alpha-VLLM/Lumina-Image-2.0", transformer=transformer, torch_dtype=torch.bfloat16
 )
 pipe.enable_model_cpu_offload()
@@ -60,7 +60,7 @@ image.save("lumina-single-file.png")
 GGUF Quantized checkpoints for the `Lumina2Transformer2DModel` can be loaded via `from_single_file` with the `GGUFQuantizationConfig`
 
 ```python
-from diffusers import Lumina2Transformer2DModel, Lumina2Text2ImgPipeline, GGUFQuantizationConfig
+from diffusers import Lumina2Transformer2DModel, Lumina2Pipeline, GGUFQuantizationConfig
 
 ckpt_path = "https://huggingface.co/calcuis/lumina-gguf/blob/main/lumina2-q4_0.gguf"
 transformer = Lumina2Transformer2DModel.from_single_file(
@@ -69,7 +69,7 @@ transformer = Lumina2Transformer2DModel.from_single_file(
     torch_dtype=torch.bfloat16,
 )
 
-pipe = Lumina2Text2ImgPipeline.from_pretrained(
+pipe = Lumina2Pipeline.from_pretrained(
     "Alpha-VLLM/Lumina-Image-2.0", transformer=transformer, torch_dtype=torch.bfloat16
 )
 pipe.enable_model_cpu_offload()
@@ -80,8 +80,8 @@ image = pipe(
 image.save("lumina-gguf.png")
 ```
 
-## Lumina2Text2ImgPipeline
+## Lumina2Pipeline
 
-[[autodoc]] Lumina2Text2ImgPipeline
+[[autodoc]] Lumina2Pipeline
 - all
 - __call__

docs/source/en/hybrid_inference/api_reference.md

Lines changed: 4 additions & 0 deletions

@@ -3,3 +3,7 @@
 ## Remote Decode
 
 [[autodoc]] utils.remote_utils.remote_decode
+
+## Remote Encode
+
+[[autodoc]] utils.remote_utils.remote_encode
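For orientation, a minimal sketch of the newly documented `remote_encode` helper is shown below. It reuses the Stable Diffusion v1 endpoint and scaling factor listed in the VAE Encode guide added in this commit; the return value is assumed to be a latent tensor.

```python
# Sketch only: encode a PIL image into latents with the remote_encode helper.
# Endpoint and scaling_factor come from the VAE Encode guide in this commit.
from diffusers.utils import load_image
from diffusers.utils.remote_utils import remote_encode

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)

latent = remote_encode(
    endpoint="https://qc6479g0aac6qwy9.us-east-1.aws.endpoints.huggingface.cloud/",  # SD v1 VAE endpoint
    image=image,
    scaling_factor=0.18215,  # SD v1 latent scaling factor
)
print(latent.shape)  # expected to be a torch.Tensor of shape [1, 4, H/8, W/8]
```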

docs/source/en/hybrid_inference/overview.md

Lines changed: 8 additions & 2 deletions

@@ -36,7 +36,7 @@ Hybrid Inference offers a fast and simple way to offload local generation requir
 ## Available Models
 
 * **VAE Decode 🖼️:** Quickly decode latent representations into high-quality images without compromising performance or workflow speed.
-* **VAE Encode 🔢 (coming soon):** Efficiently encode images into latent representations for generation and training.
+* **VAE Encode 🔢:** Efficiently encode images into latent representations for generation and training.
 * **Text Encoders 📃 (coming soon):** Compute text embeddings for your prompts quickly and accurately, ensuring a smooth and high-quality workflow.
 
 ---
@@ -46,9 +46,15 @@ Hybrid Inference offers a fast and simple way to offload local generation requir
 * **[SD.Next](https://github.com/vladmandic/sdnext):** All-in-one UI with direct supports Hybrid Inference.
 * **[ComfyUI-HFRemoteVae](https://github.com/kijai/ComfyUI-HFRemoteVae):** ComfyUI node for Hybrid Inference.
 
+## Changelog
+
+- March 10 2025: Added VAE encode
+- March 2 2025: Initial release with VAE decoding
+
 ## Contents
 
-The documentation is organized into two sections:
+The documentation is organized into three sections:
 
 * **VAE Decode** Learn the basics of how to use VAE Decode with Hybrid Inference.
+* **VAE Encode** Learn the basics of how to use VAE Encode with Hybrid Inference.
 * **API Reference** Dive into task-specific settings and parameters.
docs/source/en/hybrid_inference/vae_encode.md (new file)

Lines changed: 183 additions & 0 deletions

# Getting Started: VAE Encode with Hybrid Inference

VAE encode is used for training, image-to-image, and image-to-video: it turns images or videos into latent representations.

## Memory

These tables demonstrate the VRAM requirements for VAE encode with SD v1 and SD XL on different GPUs.

For the majority of these GPUs, the memory usage % dictates that other models (text encoders, UNet/Transformer) must be offloaded, or that tiled encoding must be used, which increases the time taken and impacts quality. A sketch of tiled encoding follows the tables below.

<details><summary>SD v1.5</summary>
| GPU | Resolution | Time (seconds) | Memory (%) | Tiled Time (secs) | Tiled Memory (%) |
|:------------------------------|:-------------|-----------------:|-------------:|--------------------:|-------------------:|
| NVIDIA GeForce RTX 4090 | 512x512 | 0.015 | 3.51901 | 0.015 | 3.51901 |
| NVIDIA GeForce RTX 4090 | 256x256 | 0.004 | 1.3154 | 0.005 | 1.3154 |
| NVIDIA GeForce RTX 4090 | 2048x2048 | 0.402 | 47.1852 | 0.496 | 3.51901 |
| NVIDIA GeForce RTX 4090 | 1024x1024 | 0.078 | 12.2658 | 0.094 | 3.51901 |
| NVIDIA GeForce RTX 4080 SUPER | 512x512 | 0.023 | 5.30105 | 0.023 | 5.30105 |
| NVIDIA GeForce RTX 4080 SUPER | 256x256 | 0.006 | 1.98152 | 0.006 | 1.98152 |
| NVIDIA GeForce RTX 4080 SUPER | 2048x2048 | 0.574 | 71.08 | 0.656 | 5.30105 |
| NVIDIA GeForce RTX 4080 SUPER | 1024x1024 | 0.111 | 18.4772 | 0.14 | 5.30105 |
| NVIDIA GeForce RTX 3090 | 512x512 | 0.032 | 3.52782 | 0.032 | 3.52782 |
| NVIDIA GeForce RTX 3090 | 256x256 | 0.01 | 1.31869 | 0.009 | 1.31869 |
| NVIDIA GeForce RTX 3090 | 2048x2048 | 0.742 | 47.3033 | 0.954 | 3.52782 |
| NVIDIA GeForce RTX 3090 | 1024x1024 | 0.136 | 12.2965 | 0.207 | 3.52782 |
| NVIDIA GeForce RTX 3080 | 512x512 | 0.036 | 8.51761 | 0.036 | 8.51761 |
| NVIDIA GeForce RTX 3080 | 256x256 | 0.01 | 3.18387 | 0.01 | 3.18387 |
| NVIDIA GeForce RTX 3080 | 2048x2048 | 0.863 | 86.7424 | 1.191 | 8.51761 |
| NVIDIA GeForce RTX 3080 | 1024x1024 | 0.157 | 29.6888 | 0.227 | 8.51761 |
| NVIDIA GeForce RTX 3070 | 512x512 | 0.051 | 10.6941 | 0.051 | 10.6941 |
| NVIDIA GeForce RTX 3070 | 256x256 | 0.015 | 3.99743 | 0.015 | 3.99743 |
| NVIDIA GeForce RTX 3070 | 2048x2048 | 1.217 | 96.054 | 1.482 | 10.6941 |
| NVIDIA GeForce RTX 3070 | 1024x1024 | 0.223 | 37.2751 | 0.327 | 10.6941 |

</details>
<details><summary>SDXL</summary>

| GPU | Resolution | Time (seconds) | Memory Consumed (%) | Tiled Time (seconds) | Tiled Memory (%) |
|:------------------------------|:-------------|-----------------:|----------------------:|-----------------------:|-------------------:|
| NVIDIA GeForce RTX 4090 | 512x512 | 0.029 | 4.95707 | 0.029 | 4.95707 |
| NVIDIA GeForce RTX 4090 | 256x256 | 0.007 | 2.29666 | 0.007 | 2.29666 |
| NVIDIA GeForce RTX 4090 | 2048x2048 | 0.873 | 66.3452 | 0.863 | 15.5649 |
| NVIDIA GeForce RTX 4090 | 1024x1024 | 0.142 | 15.5479 | 0.143 | 15.5479 |
| NVIDIA GeForce RTX 4080 SUPER | 512x512 | 0.044 | 7.46735 | 0.044 | 7.46735 |
| NVIDIA GeForce RTX 4080 SUPER | 256x256 | 0.01 | 3.4597 | 0.01 | 3.4597 |
| NVIDIA GeForce RTX 4080 SUPER | 2048x2048 | 1.317 | 87.1615 | 1.291 | 23.447 |
| NVIDIA GeForce RTX 4080 SUPER | 1024x1024 | 0.213 | 23.4215 | 0.214 | 23.4215 |
| NVIDIA GeForce RTX 3090 | 512x512 | 0.058 | 5.65638 | 0.058 | 5.65638 |
| NVIDIA GeForce RTX 3090 | 256x256 | 0.016 | 2.45081 | 0.016 | 2.45081 |
| NVIDIA GeForce RTX 3090 | 2048x2048 | 1.755 | 77.8239 | 1.614 | 18.4193 |
| NVIDIA GeForce RTX 3090 | 1024x1024 | 0.265 | 18.4023 | 0.265 | 18.4023 |
| NVIDIA GeForce RTX 3080 | 512x512 | 0.064 | 13.6568 | 0.064 | 13.6568 |
| NVIDIA GeForce RTX 3080 | 256x256 | 0.018 | 5.91728 | 0.018 | 5.91728 |
| NVIDIA GeForce RTX 3080 | 2048x2048 | OOM | OOM | 1.866 | 44.4717 |
| NVIDIA GeForce RTX 3080 | 1024x1024 | 0.302 | 44.4308 | 0.302 | 44.4308 |
| NVIDIA GeForce RTX 3070 | 512x512 | 0.093 | 17.1465 | 0.093 | 17.1465 |
| NVIDIA GeForce RTX 3070 | 256x256 | 0.025 | 7.42931 | 0.026 | 7.42931 |
| NVIDIA GeForce RTX 3070 | 2048x2048 | OOM | OOM | 2.674 | 55.8355 |
| NVIDIA GeForce RTX 3070 | 1024x1024 | 0.443 | 55.7841 | 0.443 | 55.7841 |

</details>
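The tiled encoding mentioned in the Memory section can also be done locally with a standard diffusers VAE. The following is a minimal illustrative sketch, assuming the `AutoencoderKL` tiling API and the `stabilityai/sd-vae-ft-mse` checkpoint listed in the Available VAEs table below; it is not part of the Hybrid Inference endpoints.

```python
# Sketch: local tiled VAE encoding to keep peak VRAM low.
# Large inputs are processed in tiles; small inputs fall back to a single pass.
import torch
from diffusers import AutoencoderKL
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import load_image

vae = AutoencoderKL.from_pretrained(
    "stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16
).to("cuda")
vae.enable_tiling()

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)
pixels = VaeImageProcessor().preprocess(image).to(device="cuda", dtype=torch.float16)

with torch.no_grad():
    latent = vae.encode(pixels).latent_dist.sample() * vae.config.scaling_factor
```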
## Available VAEs

| | **Endpoint** | **Model** |
|:-:|:-----------:|:--------:|
| **Stable Diffusion v1** | [https://qc6479g0aac6qwy9.us-east-1.aws.endpoints.huggingface.cloud](https://qc6479g0aac6qwy9.us-east-1.aws.endpoints.huggingface.cloud) | [`stabilityai/sd-vae-ft-mse`](https://hf.co/stabilityai/sd-vae-ft-mse) |
| **Stable Diffusion XL** | [https://xjqqhmyn62rog84g.us-east-1.aws.endpoints.huggingface.cloud](https://xjqqhmyn62rog84g.us-east-1.aws.endpoints.huggingface.cloud) | [`madebyollin/sdxl-vae-fp16-fix`](https://hf.co/madebyollin/sdxl-vae-fp16-fix) |
| **Flux** | [https://ptccx55jz97f9zgo.us-east-1.aws.endpoints.huggingface.cloud](https://ptccx55jz97f9zgo.us-east-1.aws.endpoints.huggingface.cloud) | [`black-forest-labs/FLUX.1-schnell`](https://hf.co/black-forest-labs/FLUX.1-schnell) |

> [!TIP]
> Model support can be requested [here](https://github.com/huggingface/diffusers/issues/new?template=remote-vae-pilot-feedback.yml).
## Code

> [!TIP]
> Install `diffusers` from `main` to run the code: `pip install git+https://github.com/huggingface/diffusers@main`

A helper method simplifies interacting with Hybrid Inference.

```python
from diffusers.utils.remote_utils import remote_encode
```

### Basic example

Let's encode an image, then decode it to demonstrate.

<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"/>
</figure>

<details><summary>Code</summary>

```python
from diffusers.utils import load_image
from diffusers.utils.remote_utils import remote_decode, remote_encode

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg?download=true")

latent = remote_encode(
    endpoint="https://ptccx55jz97f9zgo.us-east-1.aws.endpoints.huggingface.cloud/",
    image=image,
    scaling_factor=0.3611,
    shift_factor=0.1159,
)

decoded = remote_decode(
    endpoint="https://whhx50ex1aryqvw6.us-east-1.aws.endpoints.huggingface.cloud/",
    tensor=latent,
    scaling_factor=0.3611,
    shift_factor=0.1159,
)
```

</details>

<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/decoded.png"/>
</figure>
### Generation

Now let's look at a generation example: we'll encode the image, generate, then remotely decode too!

<details><summary>Code</summary>

```python
import torch
from diffusers import StableDiffusionImg2ImgPipeline
from diffusers.utils import load_image
from diffusers.utils.remote_utils import remote_decode, remote_encode

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    variant="fp16",
    vae=None,
).to("cuda")

init_image = load_image(
    "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
)
init_image = init_image.resize((768, 512))

init_latent = remote_encode(
    endpoint="https://qc6479g0aac6qwy9.us-east-1.aws.endpoints.huggingface.cloud/",
    image=init_image,
    scaling_factor=0.18215,
)

prompt = "A fantasy landscape, trending on artstation"
latent = pipe(
    prompt=prompt,
    image=init_latent,
    strength=0.75,
    output_type="latent",
).images

image = remote_decode(
    endpoint="https://q1bj3bpq6kzilnsu.us-east-1.aws.endpoints.huggingface.cloud/",
    tensor=latent,
    scaling_factor=0.18215,
)
image.save("fantasy_landscape.jpg")
```

</details>

<figure class="image flex flex-col items-center justify-center text-center m-0 w-full">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/remote_vae/fantasy_landscape.png"/>
</figure>

## Integrations

* **[SD.Next](https://github.com/vladmandic/sdnext):** All-in-one UI with direct support for Hybrid Inference.
* **[ComfyUI-HFRemoteVae](https://github.com/kijai/ComfyUI-HFRemoteVae):** ComfyUI node for Hybrid Inference.

examples/controlnet/train_controlnet.py

Lines changed: 1 addition & 3 deletions

@@ -152,9 +152,7 @@ def log_validation(
         validation_prompt = log["validation_prompt"]
         validation_image = log["validation_image"]
 
-        formatted_images = []
-
-        formatted_images.append(np.asarray(validation_image))
+        formatted_images = [np.asarray(validation_image)]
 
         for image in images:
             formatted_images.append(np.asarray(image))

examples/controlnet/train_controlnet_flux.py

Lines changed: 1 addition & 3 deletions

@@ -166,9 +166,7 @@ def log_validation(
         validation_prompt = log["validation_prompt"]
         validation_image = log["validation_image"]
 
-        formatted_images = []
-
-        formatted_images.append(np.asarray(validation_image))
+        formatted_images = [np.asarray(validation_image)]
 
         for image in images:
             formatted_images.append(np.asarray(image))

examples/controlnet/train_controlnet_sd3.py

Lines changed: 2 additions & 2 deletions

@@ -1283,8 +1283,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
                 noisy_model_input = (1.0 - sigmas) * model_input + sigmas * noise
 
                 # Get the text embedding for conditioning
-                prompt_embeds = batch["prompt_embeds"]
-                pooled_prompt_embeds = batch["pooled_prompt_embeds"]
+                prompt_embeds = batch["prompt_embeds"].to(dtype=weight_dtype)
+                pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(dtype=weight_dtype)
 
                 # controlnet(s) inference
                 controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)
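The change above casts the precomputed prompt embeddings to `weight_dtype` before they reach the ControlNet/transformer. A minimal, self-contained sketch of that pattern is below; the shapes and the float16 choice are hypothetical and not taken from the training script.

```python
# Sketch of the dtype-alignment pattern: precomputed embeddings are often stored
# in float32, while mixed-precision training runs the model in float16/bfloat16.
import torch

weight_dtype = torch.float16  # assumed mixed-precision dtype

batch = {
    "prompt_embeds": torch.randn(2, 154, 4096),    # hypothetical shape
    "pooled_prompt_embeds": torch.randn(2, 2048),  # hypothetical shape
}

prompt_embeds = batch["prompt_embeds"].to(dtype=weight_dtype)
pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(dtype=weight_dtype)

assert prompt_embeds.dtype == weight_dtype
assert pooled_prompt_embeds.dtype == weight_dtype
```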

examples/controlnet/train_controlnet_sdxl.py

Lines changed: 1 addition & 3 deletions

@@ -157,9 +157,7 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,
         validation_prompt = log["validation_prompt"]
         validation_image = log["validation_image"]
 
-        formatted_images = []
-
-        formatted_images.append(np.asarray(validation_image))
+        formatted_images = [np.asarray(validation_image)]
 
         for image in images:
             formatted_images.append(np.asarray(image))
