
Commit 1465133

Merge branch 'main' into to-single-file/wan
2 parents: df63133 + 9db9be6

File tree

14 files changed: +259 −236 lines changed


docs/source/en/_toctree.yml

Lines changed: 164 additions & 153 deletions
Large diffs are not rendered by default.

docs/source/en/optimization/fp16.md

Lines changed: 9 additions & 1 deletion
@@ -239,6 +239,12 @@ The `step()` function is [called](https://github.com/huggingface/diffusers/blob/
 
 In general, the `sigmas` should [stay on the CPU](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240) to avoid the communication sync and latency.
 
+<Tip>
+
+Refer to the [torch.compile and Diffusers: A Hands-On Guide to Peak Performance](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/) blog post for maximizing performance with `torch.compile` for diffusion models.
+
+</Tip>
+
 ### Benchmarks
 
 Refer to the [diffusers/benchmarks](https://huggingface.co/datasets/diffusers/benchmarks) dataset to see inference latency and memory usage data for compiled pipelines.

@@ -298,4 +304,6 @@ pipeline.fuse_qkv_projections()
 
 - Read the [Presenting Flux Fast: Making Flux go brrr on H100s](https://pytorch.org/blog/presenting-flux-fast-making-flux-go-brrr-on-h100s/) blog post to learn more about how you can combine all of these optimizations with [TorchInductor](https://docs.pytorch.org/docs/stable/torch.compiler.html) and [AOTInductor](https://docs.pytorch.org/docs/stable/torch.compiler_aot_inductor.html) for a ~2.5x speedup using recipes from [flux-fast](https://github.com/huggingface/flux-fast).
 
-These recipes support AMD hardware and [Flux.1 Kontext Dev](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev).
+These recipes support AMD hardware and [Flux.1 Kontext Dev](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev).
+- Read the [torch.compile and Diffusers: A Hands-On Guide to Peak Performance](https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/) blog post
+to maximize performance when using `torch.compile`.
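
The new tip and bullet point readers to `torch.compile` for diffusion models. As a minimal sketch of the recipe the linked posts describe — the checkpoint, `mode`, and prompt below are illustrative, not taken from this commit:

```python
import torch
from diffusers import DiffusionPipeline

# Any pipeline works; FLUX.1-dev is just an illustrative checkpoint.
pipeline = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Compile the denoiser, which dominates inference time. fullgraph=True
# surfaces graph breaks instead of silently falling back to eager mode.
pipeline.transformer = torch.compile(
    pipeline.transformer, mode="max-autotune", fullgraph=True
)

# The first call pays the compilation cost; later calls reuse the compiled graph.
image = pipeline("a photo of an astronaut riding a horse").images[0]
```

Compiling only the transformer (rather than the whole pipeline) keeps compile times manageable while still covering the hot loop, which is the approach the diffusers docs generally recommend.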

docs/source/en/tutorials/tutorial_overview.md

Lines changed: 0 additions & 23 deletions
This file was deleted.

docs/source/en/using-diffusers/overview_techniques.md

Lines changed: 0 additions & 18 deletions
This file was deleted.

examples/dreambooth/train_dreambooth_lora_flux_kontext.py

Lines changed: 1 addition & 1 deletion
@@ -1614,7 +1614,7 @@ def load_model_hook(models, input_dir):
     )
     if args.cond_image_column is not None:
         logger.info("I2I fine-tuning enabled.")
-    batch_sampler = BucketBatchSampler(train_dataset, batch_size=args.train_batch_size, drop_last=False)
+    batch_sampler = BucketBatchSampler(train_dataset, batch_size=args.train_batch_size, drop_last=True)
     train_dataloader = torch.utils.data.DataLoader(
         train_dataset,
         batch_sampler=batch_sampler,
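
The only change is `drop_last=False` → `drop_last=True`, so the bucketed sampler discards a trailing incomplete batch instead of emitting one of a different size. A quick sketch with `torch.utils.data.BatchSampler` standing in for the script's `BucketBatchSampler` shows the effect:

```python
from torch.utils.data import BatchSampler, SequentialSampler

dataset = list(range(10))  # 10 samples, batch_size=4

# drop_last=False keeps a trailing batch of 2; drop_last=True discards it,
# so every batch the trainer sees has a uniform size (and, with resolution
# bucketing, a uniform shape).
for drop_last in (False, True):
    sampler = BatchSampler(SequentialSampler(dataset), batch_size=4, drop_last=drop_last)
    print(drop_last, list(sampler))

# False [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
# True  [[0, 1, 2, 3], [4, 5, 6, 7]]
```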

tests/pipelines/cosmos/test_cosmos.py

Lines changed: 8 additions & 4 deletions
@@ -153,11 +153,15 @@ def test_inference(self):
         inputs = self.get_dummy_inputs(device)
         video = pipe(**inputs).frames
         generated_video = video[0]
-
         self.assertEqual(generated_video.shape, (9, 3, 32, 32))
-        expected_video = torch.randn(9, 3, 32, 32)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
+
+        # fmt: off
+        expected_slice = torch.tensor([0.0, 0.9686, 0.8549, 0.8078, 0.0, 0.8431, 1.0, 0.4863, 0.7098, 0.1098, 0.8157, 0.4235, 0.6353, 0.2549, 0.5137, 0.5333])
+        # fmt: on
+
+        generated_slice = generated_video.flatten()
+        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
 
     def test_callback_inputs(self):
         sig = inspect.signature(self.pipeline_class.__call__)
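
All five test updates in this commit replace the old no-op check — comparing against `torch.randn` with an `atol` of `1e10`, which can never fail — with a deterministic comparison against a stored slice of the output. A self-contained sketch of the pattern; the helper name `assert_matches_slice` is hypothetical, not part of the test suite:

```python
import torch

def assert_matches_slice(generated: torch.Tensor, expected_slice: torch.Tensor, atol: float = 1e-3) -> None:
    # Compare only the first and last 8 values of the flattened output:
    # cheap to store in the test file, yet sensitive to most regressions.
    flat = generated.flatten()
    generated_slice = torch.cat([flat[:8], flat[-8:]])
    assert torch.allclose(generated_slice, expected_slice, atol=atol)

# Usage with a dummy tensor standing in for pipeline output.
video = torch.linspace(0, 1, steps=9 * 3 * 32 * 32).reshape(9, 3, 32, 32)
expected = torch.cat([video.flatten()[:8], video.flatten()[-8:]])
assert_matches_slice(video, expected)
```

Storing only 16 values keeps the fixture tiny while still catching most numerical regressions; the same pattern repeats in each of the remaining test files below.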

tests/pipelines/cosmos/test_cosmos2_text2image.py

Lines changed: 8 additions & 4 deletions
@@ -140,11 +140,15 @@ def test_inference(self):
         inputs = self.get_dummy_inputs(device)
         image = pipe(**inputs).images
         generated_image = image[0]
-
         self.assertEqual(generated_image.shape, (3, 32, 32))
-        expected_video = torch.randn(3, 32, 32)
-        max_diff = np.abs(generated_image - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
+
+        # fmt: off
+        expected_slice = torch.tensor([0.451, 0.451, 0.4471, 0.451, 0.451, 0.451, 0.451, 0.451, 0.4784, 0.4784, 0.4784, 0.4784, 0.4784, 0.4902, 0.4588, 0.5333])
+        # fmt: on
+
+        generated_slice = generated_image.flatten()
+        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
 
     def test_callback_inputs(self):
         sig = inspect.signature(self.pipeline_class.__call__)

tests/pipelines/cosmos/test_cosmos2_video2world.py

Lines changed: 8 additions & 4 deletions
@@ -147,11 +147,15 @@ def test_inference(self):
         inputs = self.get_dummy_inputs(device)
         video = pipe(**inputs).frames
         generated_video = video[0]
-
         self.assertEqual(generated_video.shape, (9, 3, 32, 32))
-        expected_video = torch.randn(9, 3, 32, 32)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
+
+        # fmt: off
+        expected_slice = torch.tensor([0.451, 0.451, 0.4471, 0.451, 0.451, 0.451, 0.451, 0.451, 0.5098, 0.5137, 0.5176, 0.5098, 0.5255, 0.5412, 0.5098, 0.5059])
+        # fmt: on
+
+        generated_slice = generated_video.flatten()
+        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
 
     def test_components_function(self):
         init_components = self.get_dummy_components()

tests/pipelines/cosmos/test_cosmos_video2world.py

Lines changed: 8 additions & 4 deletions
@@ -159,11 +159,15 @@ def test_inference(self):
         inputs = self.get_dummy_inputs(device)
         video = pipe(**inputs).frames
         generated_video = video[0]
-
         self.assertEqual(generated_video.shape, (9, 3, 32, 32))
-        expected_video = torch.randn(9, 3, 32, 32)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
+
+        # fmt: off
+        expected_slice = torch.tensor([0.0, 0.8275, 0.7529, 0.7294, 0.0, 0.6, 1.0, 0.3804, 0.6667, 0.0863, 0.8784, 0.5922, 0.6627, 0.2784, 0.5725, 0.7765])
+        # fmt: on
+
+        generated_slice = generated_video.flatten()
+        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
+        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))
 
     def test_components_function(self):
         init_components = self.get_dummy_components()

tests/pipelines/hidream_image/test_pipeline_hidream.py

Lines changed: 8 additions & 4 deletions
@@ -146,11 +146,15 @@ def test_inference(self):
         inputs = self.get_dummy_inputs(device)
         image = pipe(**inputs)[0]
         generated_image = image[0]
-
         self.assertEqual(generated_image.shape, (128, 128, 3))
-        expected_image = torch.randn(128, 128, 3).numpy()
-        max_diff = np.abs(generated_image - expected_image).max()
-        self.assertLessEqual(max_diff, 1e10)
+
+        # fmt: off
+        expected_slice = np.array([0.4507, 0.5256, 0.4205, 0.5791, 0.4848, 0.4831, 0.4443, 0.5107, 0.6586, 0.3163, 0.7318, 0.5933, 0.6252, 0.5512, 0.5357, 0.5983])
+        # fmt: on
+
+        generated_slice = generated_image.flatten()
+        generated_slice = np.concatenate([generated_slice[:8], generated_slice[-8:]])
+        self.assertTrue(np.allclose(generated_slice, expected_slice, atol=1e-3))
 
     def test_inference_batch_single_identical(self):
         super().test_inference_batch_single_identical(expected_max_diff=3e-4)
