Commit 4c3a4fa

Merge branch 'main' into sdxl_controlnet_inpaint_and_img2img_callback_tensor_inputs
2 parents 4b437c3 + cc7b5b8 · commit 4c3a4fa

File tree

17 files changed: +718 −323 lines changed

.github/workflows/pr_tests_gpu.yml

Lines changed: 241 additions & 0 deletions

@@ -0,0 +1,241 @@
name: Fast GPU Tests on PR

on:
  pull_request:
    branches: main
    paths:
      - "src/diffusers/models/modeling_utils.py"
      - "src/diffusers/models/model_loading_utils.py"
      - "src/diffusers/pipelines/pipeline_utils.py"
      - "src/diffusers/pipeline_loading_utils.py"
      - "src/diffusers/loaders/lora_base.py"
      - "src/diffusers/loaders/lora_pipeline.py"
      - "src/diffusers/loaders/peft.py"
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

env:
  DIFFUSERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  HF_HUB_ENABLE_HF_TRANSFER: 1
  PYTEST_TIMEOUT: 600
  PIPELINE_USAGE_CUTOFF: 1000000000 # set high cutoff so that only always-test pipelines run

jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
    runs-on:
      group: aws-general-8-plus
    container:
      image: diffusers/diffusers-pytorch-cpu
    outputs:
      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
      - name: Environment
        run: |
          python utils/print_env.py
      - name: Fetch Pipeline Matrix
        id: fetch_pipeline_matrix
        run: |
          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
          echo $matrix
          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
      - name: Pipeline Tests Artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: test-pipelines.json
          path: reports

  torch_pipelines_cuda_tests:
    name: Torch Pipelines CUDA Tests
    needs: setup_torch_cuda_pipeline_matrix
    strategy:
      fail-fast: false
      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi
      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
          pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps

      - name: Environment
        run: |
          python utils/print_env.py
      - name: Extract tests
        id: extract_tests
        run: |
          pattern=$(python utils/extract_tests_from_mixin.py --type pipeline)
          echo "$pattern" > /tmp/test_pattern.txt
          echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT

      - name: PyTorch CUDA checkpoint tests on Ubuntu
        env:
          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
          pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
            -s -v -k "not Flax and not Onnx and $pattern" \
            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
            tests/pipelines/${{ matrix.module }}

      - name: Failure short reports
        if: ${{ failure() }}
        run: |
          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: pipeline_${{ matrix.module }}_test_reports
          path: reports

  torch_cuda_tests:
    name: Torch CUDA Tests
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
    defaults:
      run:
        shell: bash
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        module: [models, schedulers, lora, others]
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install peft@git+https://github.com/huggingface/peft.git
          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
          pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps

      - name: Environment
        run: |
          python utils/print_env.py

      - name: Extract tests
        id: extract_tests
        run: |
          pattern=$(python utils/extract_tests_from_mixin.py --type ${{ matrix.module }})
          echo "$pattern" > /tmp/test_pattern.txt
          echo "pattern_file=/tmp/test_pattern.txt" >> $GITHUB_OUTPUT

      - name: Run PyTorch CUDA tests
        env:
          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
          pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
          if [ -z "$pattern" ]; then
            python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
              --make-reports=tests_torch_cuda_${{ matrix.module }}
          else
            python -m pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
              --make-reports=tests_torch_cuda_${{ matrix.module }}
          fi

      - name: Failure short reports
        if: ${{ failure() }}
        run: |
          cat reports/tests_torch_cuda_${{ matrix.module }}_stats.txt
          cat reports/tests_torch_cuda_${{ matrix.module }}_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: torch_cuda_test_reports_${{ matrix.module }}
          path: reports

  run_examples_tests:
    name: Examples PyTorch CUDA tests on Ubuntu
    runs-on:
      group: aws-g4dn-2xlarge

    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi
      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test,training]

      - name: Environment
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python utils/print_env.py

      - name: Run example tests on GPU
        env:
          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install timm
          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/

      - name: Failure short reports
        if: ${{ failure() }}
        run: |
          cat reports/examples_torch_cuda_stats.txt
          cat reports/examples_torch_cuda_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: examples_test_reports
          path: reports

.github/workflows/push_tests.yml

Lines changed: 0 additions & 11 deletions

@@ -1,13 +1,6 @@
 name: Fast GPU Tests on main
 
 on:
-  pull_request:
-    branches: main
-    paths:
-      - "src/diffusers/models/modeling_utils.py"
-      - "src/diffusers/models/model_loading_utils.py"
-      - "src/diffusers/pipelines/pipeline_utils.py"
-      - "src/diffusers/pipeline_loading_utils.py"
   workflow_dispatch:
   push:
     branches:
@@ -167,7 +160,6 @@ jobs:
           path: reports
 
   flax_tpu_tests:
-    if: ${{ github.event_name != 'pull_request' }}
     name: Flax TPU Tests
     runs-on:
       group: gcp-ct5lp-hightpu-8t
@@ -216,7 +208,6 @@ jobs:
           path: reports
 
   onnx_cuda_tests:
-    if: ${{ github.event_name != 'pull_request' }}
     name: ONNX CUDA Tests
     runs-on:
       group: aws-g4dn-2xlarge
@@ -265,7 +256,6 @@ jobs:
           path: reports
 
   run_torch_compile_tests:
-    if: ${{ github.event_name != 'pull_request' }}
     name: PyTorch Compile CUDA tests
 
     runs-on:
@@ -309,7 +299,6 @@ jobs:
           path: reports
 
   run_xformers_tests:
-    if: ${{ github.event_name != 'pull_request' }}
     name: PyTorch xformers CUDA tests
 
     runs-on:

docs/source/en/_toctree.yml

Lines changed: 4 additions & 0 deletions

@@ -543,6 +543,10 @@
       title: Overview
     - local: api/schedulers/cm_stochastic_iterative
       title: CMStochasticIterativeScheduler
+    - local: api/schedulers/ddim_cogvideox
+      title: CogVideoXDDIMScheduler
+    - local: api/schedulers/multistep_dpm_solver_cogvideox
+      title: CogVideoXDPMScheduler
     - local: api/schedulers/consistency_decoder
       title: ConsistencyDecoderScheduler
     - local: api/schedulers/cosine_dpm

docs/source/en/api/pipelines/flux.md

Lines changed: 68 additions & 2 deletions

@@ -359,8 +359,74 @@ image.save('flux_ip_adapter_output.jpg')
 <figcaption class="mt-2 text-sm text-center text-gray-500">IP-Adapter examples with prompt "wearing sunglasses"</figcaption>
 </div>
 
+## Optimize
 
-## Running FP16 inference
+Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeling components. Enable some of the optimizations below to lower the memory requirements.
+
+### Group offloading
+
+[Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module-level.
+
+On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.
+
+> [!TIP]
+> It is possible to mix block and leaf-level offloading for different components in a pipeline.
+
+```py
+import torch
+from diffusers import FluxPipeline
+from diffusers.hooks import apply_group_offloading
+
+model_id = "black-forest-labs/FLUX.1-dev"
+dtype = torch.bfloat16
+pipe = FluxPipeline.from_pretrained(
+    model_id,
+    torch_dtype=dtype,
+)
+
+apply_group_offloading(
+    pipe.transformer,
+    offload_type="leaf_level",
+    offload_device=torch.device("cpu"),
+    onload_device=torch.device("cuda"),
+    use_stream=True,
+)
+apply_group_offloading(
+    pipe.text_encoder,
+    offload_device=torch.device("cpu"),
+    onload_device=torch.device("cuda"),
+    offload_type="leaf_level",
+    use_stream=True,
+)
+apply_group_offloading(
+    pipe.text_encoder_2,
+    offload_device=torch.device("cpu"),
+    onload_device=torch.device("cuda"),
+    offload_type="leaf_level",
+    use_stream=True,
+)
+apply_group_offloading(
+    pipe.vae,
+    offload_device=torch.device("cpu"),
+    onload_device=torch.device("cuda"),
+    offload_type="leaf_level",
+    use_stream=True,
+)
+
+prompt="A cat wearing sunglasses and working as a lifeguard at pool."
+
+generator = torch.Generator().manual_seed(181201)
+image = pipe(
+    prompt,
+    width=576,
+    height=1024,
+    num_inference_steps=30,
+    generator=generator
+).images[0]
+image
+```
+
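As an aside to the tip above (this sketch is not part of the diff), block and leaf-level offloading can be mixed per component. A minimal sketch, assuming `num_blocks_per_group` is the parameter that sets how many sequential blocks form one offload group:

```py
# Illustrative sketch only, not part of this commit: coarse block-level offloading for
# the large transformer, finer leaf-level offloading for the text encoders.
# `num_blocks_per_group` is assumed to control the block granularity for "block_level".
import torch
from diffusers import FluxPipeline
from diffusers.hooks import apply_group_offloading

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

# Group several sequential blocks of the transformer per offload unit.
apply_group_offloading(
    pipe.transformer,
    offload_type="block_level",
    num_blocks_per_group=2,
    offload_device=torch.device("cpu"),
    onload_device=torch.device("cuda"),
)

# Offload the smaller text encoders at the leaf level, overlapping transfers with compute.
for text_encoder in (pipe.text_encoder, pipe.text_encoder_2):
    apply_group_offloading(
        text_encoder,
        offload_type="leaf_level",
        offload_device=torch.device("cpu"),
        onload_device=torch.device("cuda"),
        use_stream=True,
    )

# Inference then proceeds exactly as in the example above.
```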
+### Running FP16 inference
 
 Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
 
@@ -389,7 +455,7 @@ out = pipe(
 out.save("image.png")
 ```
 
-## Quantization
+### Quantization
 
 Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.
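As an aside to the paragraph above (this sketch is not part of the diff), a common pattern is to quantize only the transformer and keep the rest of the pipeline in bf16. A minimal sketch, assuming diffusers' `BitsAndBytesConfig` integration and an installed `bitsandbytes` package:

```py
# Illustrative sketch only, not part of this commit; assumes the bitsandbytes backend
# is available in this diffusers installation.
import torch
from diffusers import BitsAndBytesConfig, FluxPipeline, FluxTransformer2DModel

model_id = "black-forest-labs/FLUX.1-dev"
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantize only the large transformer to 4-bit NF4.
transformer = FluxTransformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)
pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()

image = pipe("A cat wearing sunglasses", num_inference_steps=30).images[0]
```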
