Skip to content

Commit 0bf4c49

Browse files
committed
Merge branch 'main' into aritra/qunat-blog
OK
2 parents 1a0d9d0 + 04bba38 commit 0bf4c49

31 files changed

+2105
-1349
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,8 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l
112112
| **Documentation** | **What can I learn?** |
113113
|---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
114114
| [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview) | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. |
115-
| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. |
116-
| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. |
115+
| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. |
116+
| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/overview_techniques) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. |
117117
| [Optimization](https://huggingface.co/docs/diffusers/optimization/fp16) | Guides for how to optimize your diffusion model to run faster and consume less memory. |
118118
| [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. |
119119
## Contribution

src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ def create_forward(*inputs):
433433
hidden_states,
434434
temb,
435435
zq,
436-
conv_cache=conv_cache.get(conv_cache_key),
436+
conv_cache.get(conv_cache_key),
437437
)
438438
else:
439439
hidden_states, new_conv_cache[conv_cache_key] = resnet(
@@ -531,7 +531,7 @@ def create_forward(*inputs):
531531
return create_forward
532532

533533
hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
534-
create_custom_forward(resnet), hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
534+
create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key)
535535
)
536536
else:
537537
hidden_states, new_conv_cache[conv_cache_key] = resnet(
@@ -649,7 +649,7 @@ def create_forward(*inputs):
649649
hidden_states,
650650
temb,
651651
zq,
652-
conv_cache=conv_cache.get(conv_cache_key),
652+
conv_cache.get(conv_cache_key),
653653
)
654654
else:
655655
hidden_states, new_conv_cache[conv_cache_key] = resnet(
@@ -789,7 +789,7 @@ def custom_forward(*inputs):
789789
hidden_states,
790790
temb,
791791
None,
792-
conv_cache=conv_cache.get(conv_cache_key),
792+
conv_cache.get(conv_cache_key),
793793
)
794794

795795
# 2. Mid
@@ -798,14 +798,14 @@ def custom_forward(*inputs):
798798
hidden_states,
799799
temb,
800800
None,
801-
conv_cache=conv_cache.get("mid_block"),
801+
conv_cache.get("mid_block"),
802802
)
803803
else:
804804
# 1. Down
805805
for i, down_block in enumerate(self.down_blocks):
806806
conv_cache_key = f"down_block_{i}"
807807
hidden_states, new_conv_cache[conv_cache_key] = down_block(
808-
hidden_states, temb, None, conv_cache=conv_cache.get(conv_cache_key)
808+
hidden_states, temb, None, conv_cache.get(conv_cache_key)
809809
)
810810

811811
# 2. Mid
@@ -953,7 +953,7 @@ def custom_forward(*inputs):
953953
hidden_states,
954954
temb,
955955
sample,
956-
conv_cache=conv_cache.get("mid_block"),
956+
conv_cache.get("mid_block"),
957957
)
958958

959959
# 2. Up
@@ -964,7 +964,7 @@ def custom_forward(*inputs):
964964
hidden_states,
965965
temb,
966966
sample,
967-
conv_cache=conv_cache.get(conv_cache_key),
967+
conv_cache.get(conv_cache_key),
968968
)
969969
else:
970970
# 1. Mid
@@ -1476,7 +1476,7 @@ def forward(
14761476
z = posterior.sample(generator=generator)
14771477
else:
14781478
z = posterior.mode()
1479-
dec = self.decode(z)
1479+
dec = self.decode(z).sample
14801480
if not return_dict:
14811481
return (dec,)
1482-
return dec
1482+
return DecoderOutput(sample=dec)

src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import itertools
1415
from typing import Dict, Optional, Tuple, Union
1516

1617
import torch
@@ -94,7 +95,7 @@ def forward(
9495

9596
sample = self.conv_in(sample)
9697

97-
upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
98+
upscale_dtype = next(itertools.chain(self.up_blocks.parameters(), self.up_blocks.buffers())).dtype
9899
if torch.is_grad_enabled() and self.gradient_checkpointing:
99100

100101
def create_custom_forward(module):
@@ -228,14 +229,6 @@ def __init__(
228229

229230
self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
230231

231-
sample_size = (
232-
self.config.sample_size[0]
233-
if isinstance(self.config.sample_size, (list, tuple))
234-
else self.config.sample_size
235-
)
236-
self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
237-
self.tile_overlap_factor = 0.25
238-
239232
def _set_gradient_checkpointing(self, module, value=False):
240233
if isinstance(module, (Encoder, TemporalDecoder)):
241234
module.gradient_checkpointing = value

src/diffusers/models/autoencoders/autoencoder_tiny.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,9 @@ def decode(
310310
self, x: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True
311311
) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
312312
if self.use_slicing and x.shape[0] > 1:
313-
output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)]
313+
output = [
314+
self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x_slice) for x_slice in x.split(1)
315+
]
314316
output = torch.cat(output)
315317
else:
316318
output = self._tiled_decode(x) if self.use_tiling else self.decoder(x)
@@ -341,7 +343,7 @@ def forward(
341343
# as if we were loading the latents from an RGBA uint8 image.
342344
unscaled_enc = self.unscale_latents(scaled_enc / 255.0)
343345

344-
dec = self.decode(unscaled_enc)
346+
dec = self.decode(unscaled_enc).sample
345347

346348
if not return_dict:
347349
return (dec,)

src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,6 @@ def __call__(
387387
prompt: Union[str, List[str]] = None,
388388
negative_prompt: Union[str, List[str]] = None,
389389
num_inference_steps: int = 50,
390-
timesteps: List[int] = None,
391390
sigmas: List[float] = None,
392391
guidance_scale: float = 3.5,
393392
num_images_per_prompt: Optional[int] = 1,
@@ -424,10 +423,6 @@ def __call__(
424423
sigmas (`List[float]`, *optional*):
425424
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
426425
`num_inference_steps` must be `None`.
427-
timesteps (`List[int]`, *optional*):
428-
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
429-
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
430-
passed will be used. Must be in descending order.
431426
guidance_scale (`float`, *optional*, defaults to 3.5):
432427
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
433428
`guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -522,9 +517,7 @@ def __call__(
522517
# 4. Prepare timesteps
523518

524519
# sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
525-
timesteps, num_inference_steps = retrieve_timesteps(
526-
self.scheduler, num_inference_steps, device, timesteps, sigmas
527-
)
520+
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
528521

529522
# 5. Prepare latents.
530523
latent_channels = self.transformer.config.in_channels

src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ def __call__(
733733
height: Optional[int] = None,
734734
width: Optional[int] = None,
735735
num_inference_steps: int = 28,
736-
timesteps: List[int] = None,
736+
sigmas: Optional[List[float]] = None,
737737
guidance_scale: float = 7.0,
738738
control_guidance_start: Union[float, List[float]] = 0.0,
739739
control_guidance_end: Union[float, List[float]] = 1.0,
@@ -778,10 +778,10 @@ def __call__(
778778
num_inference_steps (`int`, *optional*, defaults to 28):
779779
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
780780
expense of slower inference.
781-
timesteps (`List[int]`, *optional*):
782-
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
783-
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
784-
passed will be used. Must be in descending order.
781+
sigmas (`List[float]`, *optional*):
782+
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
783+
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
784+
will be used.
785785
guidance_scale (`float`, *optional*, defaults to 7.0):
786786
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
787787
`guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -998,7 +998,7 @@ def __call__(
998998
assert False
999999

10001000
# 4. Prepare timesteps
1001-
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
1001+
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
10021002
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
10031003
self._num_timesteps = len(timesteps)
10041004

src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -787,7 +787,7 @@ def __call__(
787787
height: Optional[int] = None,
788788
width: Optional[int] = None,
789789
num_inference_steps: int = 28,
790-
timesteps: List[int] = None,
790+
sigmas: Optional[List[float]] = None,
791791
guidance_scale: float = 7.0,
792792
control_guidance_start: Union[float, List[float]] = 0.0,
793793
control_guidance_end: Union[float, List[float]] = 1.0,
@@ -833,10 +833,10 @@ def __call__(
833833
num_inference_steps (`int`, *optional*, defaults to 28):
834834
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
835835
expense of slower inference.
836-
timesteps (`List[int]`, *optional*):
837-
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
838-
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
839-
passed will be used. Must be in descending order.
836+
sigmas (`List[float]`, *optional*):
837+
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
838+
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
839+
will be used.
840840
guidance_scale (`float`, *optional*, defaults to 7.0):
841841
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
842842
`guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -1033,7 +1033,7 @@ def __call__(
10331033
controlnet_pooled_projections = controlnet_pooled_projections or pooled_prompt_embeds
10341034

10351035
# 4. Prepare timesteps
1036-
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
1036+
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)
10371037
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
10381038
self._num_timesteps = len(timesteps)
10391039

0 commit comments

Comments
 (0)