
Commit 44eefd5

Merge branch 'huggingface:main' into main
2 parents: 6d19605 + 92fe689

File tree: 73 files changed (+7636 −171 lines)

.github/workflows/nightly_tests.yml

Lines changed: 55 additions & 0 deletions

```diff
@@ -142,6 +142,7 @@ jobs:
           HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
           # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
           CUBLAS_WORKSPACE_CONFIG: :16:8
+          RUN_COMPILE: yes
         run: |
           python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
             -s -v -k "not Flax and not Onnx" \
@@ -525,6 +526,60 @@ jobs:
           pip install slack_sdk tabulate
           python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
 
+  run_nightly_pipeline_level_quantization_tests:
+    name: Torch quantization nightly tests
+    strategy:
+      fail-fast: false
+      max-parallel: 2
+    runs-on:
+      group: aws-g6e-xlarge-plus
+    container:
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "20gb" --ipc host --gpus 0
+    steps:
+      - name: Checkout diffusers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - name: NVIDIA-SMI
+        run: nvidia-smi
+      - name: Install dependencies
+        run: |
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          python -m uv pip install -U bitsandbytes optimum_quanto
+          python -m uv pip install pytest-reportlog
+      - name: Environment
+        run: |
+          python utils/print_env.py
+      - name: Pipeline-level quantization tests on GPU
+        env:
+          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+          CUBLAS_WORKSPACE_CONFIG: :16:8
+          BIG_GPU_MEMORY: 40
+        run: |
+          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+            --make-reports=tests_pipeline_level_quant_torch_cuda \
+            --report-log=tests_pipeline_level_quant_torch_cuda.log \
+            tests/quantization/test_pipeline_level_quantization.py
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: |
+          cat reports/tests_pipeline_level_quant_torch_cuda_stats.txt
+          cat reports/tests_pipeline_level_quant_torch_cuda_failures_short.txt
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch_cuda_pipeline_level_quant_reports
+          path: reports
+      - name: Generate Report and Notify Channel
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+
 # M1 runner currently not well supported
 # TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
 # run_nightly_tests_apple_m1:
```
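For local debugging, the new job's test step can be approximated with a short Python driver (a sketch, not part of the commit; assumes a CUDA GPU, the `[quality,test]` extras plus `bitsandbytes`/`optimum_quanto` installed, and `HF_TOKEN` exported):

```python
# Sketch: approximate the new CI job's pytest invocation locally.
import os
import pytest

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"  # deterministic cuBLAS, as in the workflow
os.environ["BIG_GPU_MEMORY"] = "40"              # the job targets a ~40 GB GPU

# Same flags as the workflow's run step (-n/--dist require pytest-xdist).
raise SystemExit(pytest.main([
    "-n", "1", "--max-worker-restart=0", "--dist=loadfile",
    "tests/quantization/test_pipeline_level_quantization.py",
]))
```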

docs/source/en/_toctree.yml

Lines changed: 6 additions & 0 deletions

```diff
@@ -295,6 +295,8 @@
       title: CogView4Transformer2DModel
     - local: api/models/consisid_transformer3d
       title: ConsisIDTransformer3DModel
+    - local: api/models/cosmos_transformer3d
+      title: CosmosTransformer3DModel
     - local: api/models/dit_transformer2d
       title: DiTTransformer2DModel
     - local: api/models/easyanimate_transformer3d
@@ -363,6 +365,8 @@
       title: AutoencoderKLAllegro
     - local: api/models/autoencoderkl_cogvideox
       title: AutoencoderKLCogVideoX
+    - local: api/models/autoencoderkl_cosmos
+      title: AutoencoderKLCosmos
     - local: api/models/autoencoder_kl_hunyuan_video
       title: AutoencoderKLHunyuanVideo
     - local: api/models/autoencoderkl_ltx_video
@@ -433,6 +437,8 @@
       title: ControlNet-XS with Stable Diffusion XL
     - local: api/pipelines/controlnet_union
       title: ControlNetUnion
+    - local: api/pipelines/cosmos
+      title: Cosmos
     - local: api/pipelines/dance_diffusion
       title: Dance Diffusion
     - local: api/pipelines/ddim
```
docs/source/en/api/models/autoencoderkl_cosmos.md (new file)

Lines changed: 40 additions & 0 deletions

````diff
@@ -0,0 +1,40 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# AutoencoderKLCosmos
+
+A variational autoencoder from NVIDIA's [Cosmos Tokenizers](https://github.com/NVIDIA/Cosmos-Tokenizer).
+
+Supported models:
+- [nvidia/Cosmos-1.0-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-1.0-Tokenizer-CV8x8x8)
+
+The model can be loaded with the following code snippet.
+
+```python
+from diffusers import AutoencoderKLCosmos
+
+vae = AutoencoderKLCosmos.from_pretrained("nvidia/Cosmos-1.0-Tokenizer-CV8x8x8", subfolder="vae")
+```
+
+## AutoencoderKLCosmos
+
+[[autodoc]] AutoencoderKLCosmos
+  - decode
+  - encode
+  - all
+
+## AutoencoderKLOutput
+
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
+
+## DecoderOutput
+
+[[autodoc]] models.autoencoders.vae.DecoderOutput
````
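A round trip through the autoencoder follows the standard diffusers VAE interface listed in the file above (`encode` returns an `AutoencoderKLOutput`, `decode` a `DecoderOutput`). A minimal sketch with a dummy `(batch, channels, frames, height, width)` tensor; the shapes are illustrative:

```python
import torch
from diffusers import AutoencoderKLCosmos

vae = AutoencoderKLCosmos.from_pretrained("nvidia/Cosmos-1.0-Tokenizer-CV8x8x8", subfolder="vae")

# Dummy 9-frame 64x64 RGB clip; CV8x8x8 compresses 8x temporally and 8x8 spatially.
video = torch.randn(1, 3, 9, 64, 64)

with torch.no_grad():
    latents = vae.encode(video).latent_dist.sample()  # compressed latent video
    reconstruction = vae.decode(latents).sample       # back to pixel space
```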
docs/source/en/api/models/cosmos_transformer3d.md (new file)

Lines changed: 30 additions & 0 deletions

````diff
@@ -0,0 +1,30 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. -->
+
+# CosmosTransformer3DModel
+
+A Diffusion Transformer model for 3D video-like data was introduced in [Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
+
+The model can be loaded with the following code snippet.
+
+```python
+import torch
+from diffusers import CosmosTransformer3DModel
+
+transformer = CosmosTransformer3DModel.from_pretrained("nvidia/Cosmos-1.0-Diffusion-7B-Text2World", subfolder="transformer", torch_dtype=torch.bfloat16)
+```
+
+## CosmosTransformer3DModel
+
+[[autodoc]] CosmosTransformer3DModel
+
+## Transformer2DModelOutput
+
+[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
````
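Since this transformer is the denoiser used by the Cosmos pipelines, the same module is also reachable from a loaded pipeline. A small illustrative sketch (the parameter count is just a sanity check):

```python
import torch
from diffusers import CosmosTextToWorldPipeline

pipe = CosmosTextToWorldPipeline.from_pretrained(
    "nvidia/Cosmos-1.0-Diffusion-7B-Text2World", torch_dtype=torch.bfloat16
)
transformer = pipe.transformer  # the underlying CosmosTransformer3DModel

# Should land around 7B parameters for this checkpoint.
num_params = sum(p.numel() for p in transformer.parameters())
print(f"{num_params / 1e9:.1f}B parameters")
```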
docs/source/en/api/pipelines/cosmos.md (new file)

Lines changed: 41 additions & 0 deletions

```diff
@@ -0,0 +1,41 @@
+<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+# Cosmos
+
+[Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
+
+*Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.*
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## CosmosTextToWorldPipeline
+
+[[autodoc]] CosmosTextToWorldPipeline
+  - all
+  - __call__
+
+## CosmosVideoToWorldPipeline
+
+[[autodoc]] CosmosVideoToWorldPipeline
+  - all
+  - __call__
+
+## CosmosPipelineOutput
+
+[[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput
```
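For orientation, a minimal text-to-world sketch in the style of other diffusers video pipelines (the prompt and output path are placeholders; assumes a large-memory CUDA GPU):

```python
import torch
from diffusers import CosmosTextToWorldPipeline
from diffusers.utils import export_to_video

pipe = CosmosTextToWorldPipeline.from_pretrained(
    "nvidia/Cosmos-1.0-Diffusion-7B-Text2World", torch_dtype=torch.bfloat16
).to("cuda")

prompt = "A robot arm stacks wooden blocks on a factory workbench."  # placeholder
video = pipe(prompt=prompt).frames[0]
export_to_video(video, "output.mp4", fps=30)
```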

docs/source/en/api/quantization.md

Lines changed: 4 additions & 3 deletions

```diff
@@ -13,16 +13,17 @@ specific language governing permissions and limitations under the License.
 
 # Quantization
 
-Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Diffusers supports 8-bit and 4-bit quantization with [bitsandbytes](https://huggingface.co/docs/bitsandbytes/en/index).
-
-Quantization techniques that aren't supported in Transformers can be added with the [`DiffusersQuantizer`] class.
+Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference.
 
 <Tip>
 
 Learn how to quantize models in the [Quantization](../quantization/overview) guide.
 
 </Tip>
 
+## PipelineQuantizationConfig
+
+[[autodoc]] quantizers.PipelineQuantizationConfig
+
 ## BitsAndBytesConfig
 
```
docs/source/en/quantization/overview.md

Lines changed: 87 additions & 0 deletions

````diff
@@ -39,3 +39,90 @@ Diffusers currently supports the following quantization methods.
 - [Quanto](./quanto.md)
 
 [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
+
+## Pipeline-level quantization
+
+Diffusers allows users to directly initialize pipelines from checkpoints that may contain quantized models ([example](https://huggingface.co/hf-internal-testing/flux.1-dev-nf4-pkg)). However, users may want to apply quantization on the fly when initializing a pipeline from a pre-trained, non-quantized checkpoint. You can do this with [`~quantizers.PipelineQuantizationConfig`].
+
+Start by defining a `PipelineQuantizationConfig`:
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+from diffusers.quantizers.quantization_config import QuantoConfig
+from diffusers.quantizers import PipelineQuantizationConfig
+from transformers import BitsAndBytesConfig
+
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_mapping={
+        "transformer": QuantoConfig(weights_dtype="int8"),
+        "text_encoder_2": BitsAndBytesConfig(
+            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
+        ),
+    }
+)
+```
+
+Then pass it to [`~DiffusionPipeline.from_pretrained`] and run inference:
+
+```py
+pipe = DiffusionPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    quantization_config=pipeline_quant_config,
+    torch_dtype=torch.bfloat16,
+).to("cuda")
+
+image = pipe("photo of a cute dog").images[0]
+```
+
+This method allows for granular control over the quantization of individual model-level components of a pipeline, and it supports different quantization backends for different components. In the above example, you used a combination of Quanto and bitsandbytes. One caveat, however, is that users need to know which components come from `transformers` in order to import the right quantization config class.
+
+The other method is simpler in terms of experience but less flexible. Start by defining a `PipelineQuantizationConfig` in a different way:
+
+```py
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_backend="bitsandbytes_4bit",
+    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
+    components_to_quantize=["transformer", "text_encoder_2"],
+)
+```
+
+This `pipeline_quant_config` can now be passed to [`~DiffusionPipeline.from_pretrained`] as in the example above. Here, `quant_kwargs` initializes the quantization configuration class that backs `quant_backend`, and `components_to_quantize` lists the components to quantize.
+
+The config below will work for most diffusion pipelines that have a `transformer` component. In most cases, you will want to quantize the `transformer`, as it is often the most compute- and memory-intensive part of a diffusion pipeline.
+
+```py
+pipeline_quant_config = PipelineQuantizationConfig(
+    quant_backend="bitsandbytes_4bit",
+    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
+    components_to_quantize=["transformer"],
+)
+```
+
+Below is a list of the supported quantization backends available in both `diffusers` and `transformers`:
+
+* `bitsandbytes_4bit`
+* `bitsandbytes_8bit`
+* `gguf`
+* `quanto`
+* `torchao`
+
+Diffusion pipelines can have multiple text encoders ([`FluxPipeline`] has two, for example). It's recommended to quantize the memory-intensive text encoders, such as T5, Llama, or Gemma. In the above example, you quantized the T5 model of [`FluxPipeline`] through `text_encoder_2` while keeping the CLIP model (accessible through `text_encoder`) intact.
````
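To make the backend list concrete, the sketch below mixes two backends through `quant_mapping`: torchao weight-only int8 for the transformer and 4-bit bitsandbytes for the T5 encoder, leaving CLIP untouched (`int8wo` is one assumed choice among torchao's quantization types):

```py
import torch
from diffusers import DiffusionPipeline, TorchAoConfig
from diffusers.quantizers import PipelineQuantizationConfig
from transformers import BitsAndBytesConfig

pipeline_quant_config = PipelineQuantizationConfig(
    quant_mapping={
        "transformer": TorchAoConfig(quant_type="int8wo"),  # torchao, weight-only int8
        "text_encoder_2": BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16),
    }
)

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
).to("cuda")
```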

examples/advanced_diffusion_training/train_dreambooth_lora_flux_advanced.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -430,6 +430,9 @@ def parse_args(input_args=None):
         default=4,
         help=("The dimension of the LoRA update matrices."),
     )
+
+    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
+
     parser.add_argument(
         "--with_prior_preservation",
         default=False,
@@ -1554,6 +1557,7 @@ def main(args):
     transformer_lora_config = LoraConfig(
         r=args.rank,
         lora_alpha=args.rank,
+        lora_dropout=args.lora_dropout,
         init_lora_weights="gaussian",
         target_modules=target_modules,
     )
@@ -1562,6 +1566,7 @@ def main(args):
     text_lora_config = LoraConfig(
         r=args.rank,
         lora_alpha=args.rank,
+        lora_dropout=args.lora_dropout,
         init_lora_weights="gaussian",
         target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
     )
```

examples/advanced_diffusion_training/train_dreambooth_lora_sd15_advanced.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -658,6 +658,8 @@ def parse_args(input_args=None):
         default=4,
         help=("The dimension of the LoRA update matrices."),
     )
+    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
+
     parser.add_argument(
         "--use_dora",
         action="store_true",
@@ -1248,6 +1250,7 @@ def main(args):
     unet_lora_config = LoraConfig(
         r=args.rank,
         lora_alpha=args.rank,
+        lora_dropout=args.lora_dropout,
         use_dora=args.use_dora,
         init_lora_weights="gaussian",
         target_modules=["to_k", "to_q", "to_v", "to_out.0"],
@@ -1260,6 +1263,7 @@ def main(args):
     text_lora_config = LoraConfig(
         r=args.rank,
         lora_alpha=args.rank,
+        lora_dropout=args.lora_dropout,
         use_dora=args.use_dora,
         init_lora_weights="gaussian",
         target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
```

examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -767,6 +767,9 @@ def parse_args(input_args=None):
         default=4,
         help=("The dimension of the LoRA update matrices."),
     )
+
+    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
+
     parser.add_argument(
         "--use_dora",
         action="store_true",
@@ -1558,6 +1561,7 @@ def main(args):
         r=args.rank,
         use_dora=args.use_dora,
         lora_alpha=args.rank,
+        lora_dropout=args.lora_dropout,
         init_lora_weights="gaussian",
         target_modules=target_modules,
     )
@@ -1570,6 +1574,7 @@ def main(args):
         r=args.rank,
         use_dora=args.use_dora,
         lora_alpha=args.rank,
+        lora_dropout=args.lora_dropout,
         init_lora_weights="gaussian",
         target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
     )
```
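Across all three scripts, the new flag flows straight into peft's `LoraConfig`. A minimal sketch of the resulting config (values are illustrative; names follow the peft API):

```python
from peft import LoraConfig

rank = 4            # --rank default in these scripts
lora_dropout = 0.1  # example value for the new --lora_dropout flag (default 0.0)

unet_lora_config = LoraConfig(
    r=rank,
    lora_alpha=rank,
    lora_dropout=lora_dropout,  # randomly zeroes LoRA activations during training; inactive at inference
    init_lora_weights="gaussian",
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],
)
```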
