Commit 5359222

Commit message: update
Parent: 02a2e0d

5 files changed: 89 additions, 61 deletions


src/diffusers/models/embeddings.py

Lines changed: 1 addition & 1 deletion

@@ -334,7 +334,7 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"):
             " `from_numpy` is no longer required."
             " Pass `output_type='pt' to use the new version now."
         )
-        deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False)
+        # deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False)
         return get_1d_sincos_pos_embed_from_grid_np(embed_dim=embed_dim, pos=pos)
     if embed_dim % 2 != 0:
         raise ValueError("embed_dim must be divisible by 2")

src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py

Lines changed: 2 additions & 0 deletions

@@ -24,6 +24,7 @@
 from ...loaders import CogVideoXLoraLoaderMixin
 from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
+from ...models.hooks import reset_stateful_hooks
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
 from ...utils import logging, replace_example_docstring

@@ -769,6 +770,7 @@ def __call__(

         # Offload all models
         self.maybe_free_model_hooks()
+        reset_stateful_hooks(self.transformer, recurse=True)

         if not return_dict:
             return (video,)
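
For context, a minimal usage sketch of how the two additions fit together: apply_faster_cache (from faster_cache_utils.py below) installs stateful hooks on the transformer, and the new reset_stateful_hooks call clears their per-call caches once denoising finishes. The pipeline class, checkpoint id, and the keyword construction of FasterCacheConfig are illustrative assumptions, not part of this commit.

import torch
from diffusers import CogVideoXPipeline  # illustrative pipeline choice
from diffusers.pipelines.faster_cache_utils import FasterCacheConfig, apply_faster_cache  # module path per this commit

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16).to("cuda")

# Install the FasterCache hooks on the denoiser; with this commit, the spatial attention
# skip range defaults to 2 when no value is provided.
apply_faster_cache(pipe, FasterCacheConfig())

# __call__ now ends with reset_stateful_hooks(self.transformer, recurse=True), so the caches
# built up during this generation do not leak into the next one.
video = pipe("a panda playing a guitar", num_inference_steps=50).frames[0]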

src/diffusers/pipelines/faster_cache_utils.py

Lines changed: 70 additions & 57 deletions

@@ -49,13 +49,12 @@
 class FasterCacheConfig:
     r"""
     Configuration for [FasterCache](https://huggingface.co/papers/2410.19355).
-    """

-    num_train_timesteps: int = 1000
+    Attributes:"""

     # In the paper and codebase, they hardcode these values to 2. However, it can be made configurable
     # after some testing. We default to 2 if these parameters are not provided.
-    spatial_attention_block_skip_range: Optional[int] = None
+    spatial_attention_block_skip_range: int = 2
     temporal_attention_block_skip_range: Optional[int] = None

     # TODO(aryan): write heuristics for what the best way to obtain these values are
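
What the changed default means for callers, as a small sketch; it assumes FasterCacheConfig is a keyword-constructible dataclass, which this hunk does not show explicitly.

from diffusers.pipelines.faster_cache_utils import FasterCacheConfig  # module path per this commit

# Omitting spatial_attention_block_skip_range no longer relies on a warning-and-default
# fallback inside apply_faster_cache; the config itself now defaults to 2.
config = FasterCacheConfig()
assert config.spatial_attention_block_skip_range == 2
assert config.temporal_attention_block_skip_range is None  # temporal skipping stays opt-in

# Overriding is still possible, e.g. recompute spatial attention only on every 4th eligible step.
config = FasterCacheConfig(spatial_attention_block_skip_range=4)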
@@ -145,6 +144,9 @@ def apply_faster_cache(
     r"""
     Applies [FasterCache](https://huggingface.co/papers/2410.19355) to a given pipeline.

+    Note: FasterCache should only be applied when using classifier-free guidance; otherwise it will not work as
+    expected, even if inference runs successfully.
+
     Args:
         pipeline (`DiffusionPipeline`):
             The diffusion pipeline to apply FasterCache to.
@@ -163,15 +165,6 @@ def apply_faster_cache(
     if config is None:
         config = FasterCacheConfig()

-    if config.spatial_attention_block_skip_range is None and config.temporal_attention_block_skip_range is None:
-        logger.warning(
-            "FasterCache requires one of `spatial_attention_block_skip_range` and/or `temporal_attention_block_skip_range` "
-            "to be set to an integer, not `None`. Defaulting to using `spatial_attention_block_skip_range=2` and "
-            "`temporal_attention_block_skip_range=2`. To avoid this warning, please set one of the above parameters."
-        )
-        config.spatial_attention_block_skip_range = 2
-        config.temporal_attention_block_skip_range = 2
-
     if config.attention_weight_callback is None:
         # If the user has not provided a weight callback, we default to 0.5 for all timesteps.
         # In the paper, they recommend using a gradually increasing weight from 0 to 1 as the inference progresses, but
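
The comment above references the paper's recommendation of a weight that ramps up as inference progresses. A hedged sketch of such a callback, using the Callable[[nn.Module], float] shape implied by the state.weight_callback(module) calls later in this file; the closure over the pipeline's _current_timestep and the 1000-step range are illustrative assumptions.

import torch.nn as nn

def make_ramp_weight_callback(pipeline, num_train_timesteps: int = 1000):
    # Weight grows from ~0 early in denoising (t near num_train_timesteps) toward 1 at the end
    # (t near 0), approximating the increasing schedule the paper recommends instead of a flat 0.5.
    def attention_weight_callback(module: nn.Module) -> float:
        t = float(pipeline._current_timestep)  # tracked by the pipelines touched in this commit
        return 1.0 - t / num_train_timesteps
    return attention_weight_callback

# Quick standalone check with a stand-in object; in practice `pipeline` is the DiffusionPipeline
# and the callback would be passed as FasterCacheConfig(attention_weight_callback=...).
class _FakePipeline:
    _current_timestep = 750.0

print(make_ramp_weight_callback(_FakePipeline())(nn.Identity()))  # 0.25 this early in denoising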
@@ -231,12 +224,6 @@ def _apply_fastercache_on_denoiser(
     pipeline: DiffusionPipeline, denoiser: nn.Module, config: FasterCacheConfig
 ) -> None:
     def uncond_skip_callback(module: nn.Module) -> bool:
-        # If we are not using classifier-free guidance, we cannot skip the denoiser computation. We only compute the
-        # conditional branch in this case.
-        is_using_classifier_free_guidance = pipeline.do_classifier_free_guidance
-        if not is_using_classifier_free_guidance:
-            return False
-
         # We skip the unconditional branch only if the following conditions are met:
         # 1. We have completed at least one iteration of the denoiser
         # 2. The current timestep is within the range specified by the user. This is the optimal timestep range
@@ -298,20 +285,13 @@ def _apply_fastercache_on_attention_class(
         return

     def skip_callback(module: nn.Module) -> bool:
-        is_using_classifier_free_guidance = pipeline.do_classifier_free_guidance
-        if not is_using_classifier_free_guidance:
-            return False
-
         fastercache_state: FasterCacheState = module._fastercache_state
         is_within_timestep_range = timestep_skip_range[0] < pipeline._current_timestep < timestep_skip_range[1]

         if not is_within_timestep_range:
             # We are still not in the phase of inference where skipping attention is possible with minimal quality
             # loss, as described in the paper. So, the attention computation cannot be skipped.
             return False
-        if fastercache_state.cache is None or fastercache_state.iteration < 2:
-            # We need at least 2 iterations to start skipping attention computation
-            return False

         should_compute_attention = (
             fastercache_state.iteration > 0 and fastercache_state.iteration % block_skip_range == 0
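
For intuition about the cadence this predicate produces, a small standalone sketch; it assumes skip_callback ultimately skips when should_compute_attention is False, which the truncated hunk does not show, and it ignores the timestep-range gate above.

block_skip_range = 2  # the new FasterCacheConfig default for spatial attention blocks

for iteration in range(1, 9):
    should_compute_attention = iteration > 0 and iteration % block_skip_range == 0
    # With block_skip_range=2, attention is recomputed on even iterations (2, 4, 6, ...) and
    # approximated from the cache on the others, roughly halving attention cost inside the
    # timestep window where skipping is allowed.
    print(iteration, "compute" if should_compute_attention else "approximate from cache")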
@@ -358,8 +338,6 @@ def new_forward(self, module: nn.Module, *args, **kwargs) -> Any:
             # TODO(aryan): remove later
             logger.debug("Skipping unconditional branch computation")

-        if should_skip_uncond:
-            breakpoint()
         output = module._old_forward(*args, **kwargs)
         # TODO(aryan): handle Transformer2DModelOutput
         hidden_states = output[0] if isinstance(output, tuple) else output
@@ -422,6 +400,22 @@ def reset_state(self, module: nn.Module) -> None:
 class FasterCacheBlockHook(ModelHook):
     _is_stateful = True

+    def _compute_approximated_attention_output(
+        self, t_2_output: torch.Tensor, t_output: torch.Tensor, weight: float, batch_size: int
+    ) -> torch.Tensor:
+        # TODO(aryan): these conditions may not be needed after latest refactor. they exist for safety. do test if they can be removed
+        if t_2_output.size(0) != batch_size:
+            # The cache t_2_output contains both batchwise-concatenated unconditional-conditional branch outputs. Just
+            # take the conditional branch outputs.
+            assert t_2_output.size(0) == 2 * batch_size
+            t_2_output = t_2_output[batch_size:]
+        if t_output.size(0) != batch_size:
+            # The cache t_output contains both batchwise-concatenated unconditional-conditional branch outputs. Just
+            # take the conditional branch outputs.
+            assert t_output.size(0) == 2 * batch_size
+            t_output = t_output[batch_size:]
+        return t_output + (t_output - t_2_output) * weight
+
     def new_forward(self, module: nn.Module, *args, **kwargs) -> Any:
         args, kwargs = module._diffusers_hook.pre_forward(module, *args, **kwargs)
         state: FasterCacheState = module._fastercache_state
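
The helper factored out above is the core FasterCache approximation: linearly extrapolate the current attention output from the two most recently cached conditional-branch outputs. A standalone numerical sketch with illustrative tensor shapes:

import torch

batch_size, seq_len, dim = 2, 16, 64
weight = 0.5  # the constant returned by the default attention_weight_callback in this file

# Cached outputs from the two most recent computed iterations, stored batchwise-concatenated
# as [unconditional, conditional]; the helper slices out the conditional half before extrapolating.
t_2_output = torch.randn(2 * batch_size, seq_len, dim)  # older cached output
t_output = torch.randn(2 * batch_size, seq_len, dim)    # newer cached output

t_2_cond = t_2_output[batch_size:]
t_cond = t_output[batch_size:]

# output ~= newer + (newer - older) * weight, i.e. continue the trajectory of attention outputs.
approx = t_cond + (t_cond - t_2_cond) * weight
assert approx.shape == (batch_size, seq_len, dim)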
@@ -435,40 +429,59 @@ def new_forward(self, module: nn.Module, *args, **kwargs) -> Any:
             state.batch_size = batch_size

         # If we have to skip due to the skip conditions, then let's skip as expected.
-        # But, we can't skip if the denoiser wants to infer both unconditional and conditional branches. So,
-        # if state.batch_size (which is the true unconditional-conditional batch size) is same as the current
-        # batch size, we don't perform the layer skip. Otherwise, we conditionally skip the layer based on
-        # what state.skip_callback returns.
-        if state.skip_callback(module) and state.batch_size != batch_size:
+        # But, we can't skip if the denoiser wants to infer both unconditional and conditional branches. This
+        # is because the expected output shapes of the attention layer will not match if we only return values from
+        # the cache (which only caches conditional branch outputs). So, if state.batch_size (which is the true
+        # unconditional-conditional batch size) is same as the current batch size, we don't perform the layer
+        # skip. Otherwise, we conditionally skip the layer based on what state.skip_callback returns.
+        should_skip_attention = state.skip_callback(module) and state.batch_size != batch_size
+
+        if should_skip_attention:
             # TODO(aryan): remove later
-            logger.debug("Skipping layer computation")
-            t_2_output, t_output = state.cache
-
-            # TODO(aryan): these conditions may not be needed after latest refactor. they exist for safety. do test if they can be removed
-            if t_2_output.size(0) != batch_size:
-                # The cache t_2_output contains both batchwise-concatenated unconditional-conditional branch outputs. Just
-                # take the conditional branch outputs.
-                assert t_2_output.size(0) == 2 * batch_size
-                t_2_output = t_2_output[batch_size:]
-            if t_output.size(0) != batch_size:
-                # The cache t_output contains both batchwise-concatenated unconditional-conditional branch outputs. Just
-                # take the conditional branch outputs.
-                assert t_output.size(0) == 2 * batch_size
-                t_output = t_output[batch_size:]
-
-            output = t_output + (t_output - t_2_output) * state.weight_callback(module)
+            logger.debug("Skipping attention")
+
+            if torch.is_tensor(state.cache):
+                t_2_output, t_output = state.cache
+                weight = state.weight_callback(module)
+                output = self._compute_approximated_attention_output(t_2_output, t_output, weight, batch_size)
+            else:
+                # The cache contains multiple tensors from past N iterations (N=2 for FasterCache). We need to handle all of them.
+                # Diffusers blocks can return multiple tensors - let's call them [A, B, C, ...] for simplicity.
+                # In our cache, we would have [[A_1, B_1, C_1, ...], [A_2, B_2, C_2, ...], ...] where each list is the output from
+                # a forward pass of the block. We need to compute the approximated output for each of these tensors.
+                # The zip(*state.cache) operation will give us [(A_1, A_2, ...), (B_1, B_2, ...), (C_1, C_2, ...), ...] which
+                # allows us to compute the approximated attention output for each tensor in the cache.
+                output = ()
+                for t_2_output, t_output in zip(*state.cache):
+                    result = self._compute_approximated_attention_output(
+                        t_2_output, t_output, state.weight_callback(module), batch_size
+                    )
+                    output += (result,)
         else:
+            logger.debug("Computing attention")
             output = module._old_forward(*args, **kwargs)

-        # The output here can be both unconditional-conditional branch outputs or just conditional branch outputs.
-        # This is determined at the higher-level denoiser module. We only want to cache the conditional branch outputs.
-        cache_output = output
-        if output.size(0) == state.batch_size:
-            cache_output = cache_output.chunk(2, dim=0)[1]
-
-        # Just to be safe that the output is of the correct size for both unconditional-conditional branch inference
-        # and only-conditional branch inference.
-        assert 2 * cache_output.size(0) == state.batch_size
+        # Note that the following condition for getting hidden_states should suffice since Diffusers blocks either return
+        # a single hidden_states tensor, or a tuple of (hidden_states, encoder_hidden_states) tensors. We need to handle
+        # both cases.
+        if torch.is_tensor(output):
+            cache_output = output
+            if cache_output.size(0) == state.batch_size:
+                # The output here can be both unconditional-conditional branch outputs or just conditional branch outputs.
+                # This is determined at the higher-level denoiser module. We only want to cache the conditional branch outputs.
+                cache_output = cache_output.chunk(2, dim=0)[1]

+            # Just to be safe that the output is of the correct size for both unconditional-conditional branch inference
+            # and only-conditional branch inference.
+            assert 2 * cache_output.size(0) == state.batch_size
+        else:
+            # Cache all return values and perform the same operation as above
+            cache_output = ()
+            for out in output:
+                if out.size(0) == state.batch_size:
+                    out = out.chunk(2, dim=0)[1]
+                assert 2 * out.size(0) == state.batch_size
+                cache_output += (out,)

         if state.cache is None:
             state.cache = [cache_output, cache_output]
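
A small standalone illustration of the tuple-handling path added above, using plain tensors in place of real block outputs; it assumes the cached tuples already hold conditional-only tensors, so the branch slicing inside the helper is skipped here, and the two-element cache layout mirrors state.cache = [cache_output, cache_output].

import torch

batch_size, seq_len, dim = 2, 8, 4
weight = 0.5

def approximate(t_2_output, t_output):
    # Same linear extrapolation as _compute_approximated_attention_output, minus the
    # conditional-branch slicing, which these already-sliced tensors do not need.
    return t_output + (t_output - t_2_output) * weight

# Each cached entry is the tuple a block returned, e.g. (hidden_states, encoder_hidden_states).
cache_older = (torch.randn(batch_size, seq_len, dim), torch.randn(batch_size, 4, dim))
cache_newer = (torch.randn(batch_size, seq_len, dim), torch.randn(batch_size, 4, dim))
cache = [cache_older, cache_newer]

# zip(*cache) pairs up the older and newer versions of each returned tensor:
# [(hidden_states_old, hidden_states_new), (encoder_hidden_states_old, encoder_hidden_states_new)]
output = tuple(approximate(older, newer) for older, newer in zip(*cache))
assert len(output) == 2 and output[0].shape == (batch_size, seq_len, dim)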

src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video.py

Lines changed: 6 additions & 0 deletions

@@ -22,6 +22,7 @@
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...loaders import HunyuanVideoLoraLoaderMixin
 from ...models import AutoencoderKLHunyuanVideo, HunyuanVideoTransformer3DModel
+from ...models.hooks import reset_stateful_hooks
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor

@@ -573,6 +574,7 @@ def __call__(

         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
         self._interrupt = False

         device = self._execution_device

@@ -640,6 +642,7 @@ def __call__(
                 if self.interrupt:
                     continue

+                self._current_timestep = t
                 latent_model_input = latents.to(transformer_dtype)
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latents.shape[0]).to(latents.dtype)

@@ -671,6 +674,8 @@ def __call__(
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()

+        self._current_timestep = None
+
         if not output_type == "latent":
             latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
             video = self.vae.decode(latents, return_dict=False)[0]

@@ -680,6 +685,7 @@ def __call__(

         # Offload all models
         self.maybe_free_model_hooks()
+        reset_stateful_hooks(self.transformer, recurse=True)

         if not return_dict:
             return (video,)
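
Why the pipelines now track _current_timestep: the FasterCache skip_callback above gates skipping on timestep_skip_range[0] < pipeline._current_timestep < timestep_skip_range[1]. A minimal sketch of that gate in isolation; the range values are illustrative, not confirmed library defaults.

# Illustrative skip window (the real defaults live in FasterCacheConfig): only approximate
# attention once denoising has reached timesteps inside this range.
timestep_skip_range = (-1, 681)

def is_within_skip_range(current_timestep: float) -> bool:
    return timestep_skip_range[0] < current_timestep < timestep_skip_range[1]

# The pipeline sets self._current_timestep = t on every denoising step and resets it to None
# after the loop, so a hook reading it never sees a stale value from a previous call.
for t in (999.0, 750.0, 500.0, 100.0):
    print(t, "skip allowed" if is_within_skip_range(t) else "always compute")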

src/diffusers/pipelines/mochi/pipeline_mochi.py

Lines changed: 10 additions & 3 deletions

@@ -21,8 +21,8 @@

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...loaders import Mochi1LoraLoaderMixin
-from ...models.autoencoders import AutoencoderKL
-from ...models.transformers import MochiTransformer3DModel
+from ...models import AutoencoderKLHunyuanVideo, MochiTransformer3DModel
+from ...models.hooks import reset_stateful_hooks
 from ...schedulers import FlowMatchEulerDiscreteScheduler
 from ...utils import (
     is_torch_xla_available,

@@ -184,7 +184,7 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
     def __init__(
         self,
         scheduler: FlowMatchEulerDiscreteScheduler,
-        vae: AutoencoderKL,
+        vae: AutoencoderKLHunyuanVideo,
         text_encoder: T5EncoderModel,
         tokenizer: T5TokenizerFast,
         transformer: MochiTransformer3DModel,

@@ -604,6 +604,7 @@ def __call__(

         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
+        self._current_timestep = None
         self._interrupt = False

         # 2. Define call parameters

@@ -673,6 +674,9 @@ def __call__(
                 if self.interrupt:
                     continue

+                # Note: Mochi uses reversed timesteps. To ensure compatibility with methods like FasterCache, we need
+                # to make sure we're using the correct non-reversed timestep values.
+                self._current_timestep = 1000 - t
                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype)

@@ -718,6 +722,8 @@ def __call__(
                 if XLA_AVAILABLE:
                     xm.mark_step()

+        self._current_timestep = None
+
         if output_type == "latent":
             video = latents
         else:

@@ -741,6 +747,7 @@ def __call__(

         # Offload all models
         self.maybe_free_model_hooks()
+        reset_stateful_hooks(self.transformer, recurse=True)

         if not return_dict:
             return (video,)
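
The Mochi change normalizes the scheduler's reversed timesteps before exposing them, so the same timestep-range semantics apply across pipelines. A tiny sketch of the conversion; the 1000-step range is taken from the diff, while the sample scheduler values are illustrative.

# Per the comment in the diff, Mochi's scheduler timesteps are reversed relative to the other
# pipelines here, so the pipeline exposes 1000 - t to keep timestep-range checks consistent.
num_train_timesteps = 1000

for t in (1000.0, 750.0, 250.0, 0.0):  # illustrative scheduler timesteps
    current_timestep = num_train_timesteps - t  # value stored in self._current_timestep
    print(t, "->", current_timestep)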
