Commit e8768e5

apply review suggestions
1 parent 78fca12 commit e8768e5

4 files changed (+128, -67 lines)

src/diffusers/guiders/classifier_free_guidance.py

Lines changed: 12 additions & 11 deletions
@@ -13,12 +13,15 @@
 # limitations under the License.
 
 import math
-from typing import Optional, Union, Tuple, List
+from typing import Optional, Union, Tuple, List, TYPE_CHECKING
 
 import torch
 
 from .guider_utils import BaseGuidance, rescale_noise_cfg, _default_prepare_inputs
 
+if TYPE_CHECKING:
+    from ..pipelines.modular_pipeline import BlockState
+
 
 class ClassifierFreeGuidance(BaseGuidance):
     """

@@ -72,15 +75,13 @@ def __init__(
         self.guidance_rescale = guidance_rescale
         self.use_original_formulation = use_original_formulation
 
-    def prepare_inputs(self, denoiser: torch.nn.Module, *args: Union[Tuple[torch.Tensor], List[torch.Tensor]]) -> Tuple[List[torch.Tensor], ...]:
-        return _default_prepare_inputs(denoiser, self.num_conditions, *args)
-
-    def prepare_outputs(self, denoiser: torch.nn.Module, pred: torch.Tensor) -> None:
-        self._num_outputs_prepared += 1
-        if self._num_outputs_prepared > self.num_conditions:
-            raise ValueError(f"Expected {self.num_conditions} outputs, but prepare_outputs called more times.")
-        key = self._input_predictions[self._num_outputs_prepared - 1]
-        self._preds[key] = pred
+    def prepare_inputs(self, data: "BlockState") -> List["BlockState"]:
+        tuple_indices = [0] if self.num_conditions == 1 else [0, 1]
+        data_batches = []
+        for i in range(self.num_conditions):
+            data_batch = self._prepare_batch(self._input_fields, data, tuple_indices[i], self._input_predictions[i])
+            data_batches.append(data_batch)
+        return data_batches
 
     def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> torch.Tensor:
         pred = None

@@ -95,7 +96,7 @@ def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] =
         if self.guidance_rescale > 0.0:
            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)
 
-        return pred
+        return pred, {}
 
     @property
     def is_conditional(self) -> bool:
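
The reworked `prepare_inputs` no longer unpacks tuples of tensors argument by argument; it fans a single `BlockState` out into one batch per condition, each tagged with the identifier of the prediction it will produce (`pred_cond`, then `pred_uncond`). A minimal runnable sketch of that data flow, using `types.SimpleNamespace` as a hypothetical stand-in for `BlockState` (the helper below is illustrative, not a diffusers API):

```
from types import SimpleNamespace

IDENTIFIER_KEY = "__guidance_identifier__"  # mirrors BaseGuidance._identifier_key

def prepare_inputs_sketch(data, input_fields, input_predictions, num_conditions):
    """Build one batch per condition, mirroring the new CFG prepare_inputs."""
    tuple_indices = [0] if num_conditions == 1 else [0, 1]
    batches = []
    for i in range(num_conditions):
        batch = {}
        for key, value in input_fields.items():
            # A plain string: the same field feeds every batch.
            # A (cond, uncond) tuple: pick the field by the batch's tuple index.
            field = value if isinstance(value, str) else value[tuple_indices[i]]
            batch[key] = getattr(data, field)
        batch[IDENTIFIER_KEY] = input_predictions[i]
        batches.append(SimpleNamespace(**batch))
    return batches

data = SimpleNamespace(latents="lat", prompt_embeds="cond", negative_prompt_embeds="uncond")
batches = prepare_inputs_sketch(
    data,
    input_fields={"latents": "latents", "prompt_embeds": ("prompt_embeds", "negative_prompt_embeds")},
    input_predictions=["pred_cond", "pred_uncond"],
    num_conditions=2,
)
print([getattr(b, IDENTIFIER_KEY) for b in batches])  # ['pred_cond', 'pred_uncond']
print([b.prompt_embeds for b in batches])             # ['cond', 'uncond']
```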

src/diffusers/guiders/entropy_rectifying_guidance.py

Whitespace-only changes.

src/diffusers/guiders/guider_utils.py

Lines changed: 90 additions & 14 deletions
@@ -21,6 +21,7 @@
 
 if TYPE_CHECKING:
     from ..models.attention_processor import AttentionProcessor
+    from ..pipelines.modular_pipeline import BlockState
 
 
 logger = get_logger(__name__)  # pylint: disable=invalid-name

@@ -30,14 +31,15 @@ class BaseGuidance:
     r"""Base class providing the skeleton for implementing guidance techniques."""
 
     _input_predictions = None
+    _identifier_key = "__guidance_identifier__"
 
     def __init__(self, start: float = 0.0, stop: float = 1.0):
         self._start = start
         self._stop = stop
         self._step: int = None
         self._num_inference_steps: int = None
         self._timestep: torch.LongTensor = None
-        self._preds: Dict[str, torch.Tensor] = {}
+        self._input_fields: Dict[str, Union[str, Tuple[str, str]]] = None
         self._num_outputs_prepared: int = 0
         self._enabled = True

@@ -65,28 +67,64 @@ def set_state(self, step: int, num_inference_steps: int, timestep: torch.LongTen
         self._step = step
         self._num_inference_steps = num_inference_steps
         self._timestep = timestep
-        self._preds = {}
         self._num_outputs_prepared = 0
 
+    def set_input_fields(self, **kwargs: Dict[str, Union[str, Tuple[str, str]]]) -> None:
+        """
+        Set the input fields for the guidance technique. The input fields are used to specify the names of the
+        returned attributes containing the prepared data after `prepare_inputs` is called. The prepared data is
+        obtained from the values of the provided keyword arguments to this method.
+
+        Args:
+            **kwargs (`Dict[str, Union[str, Tuple[str, str]]]`):
+                A dictionary where the keys are the names of the fields that will be used to store the data once
+                it is prepared with `prepare_inputs`. The values can be either a string or a tuple of length 2,
+                which is used to look up the required data provided for preparation.
+
+                If a string is provided, it will be used as the conditional data (or unconditional if used with
+                a guidance method that requires it). If a tuple of length 2 is provided, the first element must
+                be the conditional data identifier and the second element must be the unconditional data identifier
+                or None.
+
+        Example:
+
+        ```
+        data = {"prompt_embeds": <some tensor>, "negative_prompt_embeds": <some tensor>, "latents": <some tensor>}
+
+        BaseGuidance.set_input_fields(
+            latents="latents",
+            prompt_embeds=("prompt_embeds", "negative_prompt_embeds"),
+        )
+        ```
+        """
+        for key, value in kwargs.items():
+            is_string = isinstance(value, str)
+            is_tuple_of_str_with_len_2 = isinstance(value, tuple) and len(value) == 2 and all(isinstance(v, str) for v in value)
+            if not (is_string or is_tuple_of_str_with_len_2):
+                raise ValueError(
+                    f"Expected `set_input_fields` to be called with a string or a tuple of string with length 2, but got {type(value)} for key {key}."
+                )
+        self._input_fields = kwargs
+
     def prepare_models(self, denoiser: torch.nn.Module) -> None:
         """
         Prepares the models for the guidance technique on a given batch of data. This method should be overridden in
         subclasses to implement specific model preparation logic.
         """
         pass
 
-    def prepare_inputs(self, denoiser: torch.nn.Module, *args: Union[Tuple[torch.Tensor], List[torch.Tensor]]) -> Tuple[List[torch.Tensor], ...]:
+    def prepare_inputs(self, data: "BlockState") -> List["BlockState"]:
         raise NotImplementedError("BaseGuidance::prepare_inputs must be implemented in subclasses.")
 
-    def prepare_outputs(self, denoiser: torch.nn.Module, pred: torch.Tensor) -> None:
-        raise NotImplementedError("BaseGuidance::prepare_outputs must be implemented in subclasses.")
-
-    def __call__(self, **kwargs) -> Any:
-        if len(kwargs) != self.num_conditions:
+    def __call__(self, data: List["BlockState"]) -> Any:
+        if not all(hasattr(d, "noise_pred") for d in data):
+            raise ValueError("Expected all data to have `noise_pred` attribute.")
+        if len(data) != self.num_conditions:
             raise ValueError(
-                f"Expected {self.num_conditions} arguments, but got {len(kwargs)}. Please provide the correct number of arguments."
+                f"Expected {self.num_conditions} data items, but got {len(data)}. Please check the input data."
             )
-        return self.forward(**kwargs)
+        forward_inputs = {getattr(d, self._identifier_key): d.noise_pred for d in data}
+        return self.forward(**forward_inputs)
 
     def forward(self, *args, **kwargs) -> Any:
         raise NotImplementedError("BaseGuidance::forward must be implemented in subclasses.")

@@ -102,10 +140,48 @@ def is_unconditional(self) -> bool:
 
     @property
     def num_conditions(self) -> int:
         raise NotImplementedError("BaseGuidance::num_conditions must be implemented in subclasses.")
-
-    @property
-    def outputs(self) -> Dict[str, torch.Tensor]:
-        return self._preds, {}
+
+    @classmethod
+    def _prepare_batch(cls, input_fields: Dict[str, Union[str, Tuple[str, str]]], data: "BlockState", tuple_index: int, identifier: str) -> "BlockState":
+        """
+        Prepares a batch of data for the guidance technique. This method is used in the `prepare_inputs` method of
+        the `BaseGuidance` class. It prepares the batch based on the provided tuple index.
+
+        Args:
+            input_fields (`Dict[str, Union[str, Tuple[str, str]]]`):
+                A dictionary where the keys are the names of the fields that will be used to store the data once
+                it is prepared with `prepare_inputs`. The values can be either a string or a tuple of length 2,
+                which is used to look up the required data provided for preparation.
+                If a string is provided, it will be used as the conditional data (or unconditional if used with
+                a guidance method that requires it). If a tuple of length 2 is provided, the first element must
+                be the conditional data identifier and the second element must be the unconditional data identifier
+                or None.
+            data (`BlockState`):
+                The input data to be prepared.
+            tuple_index (`int`):
+                The index to use when accessing input fields that are tuples.
+
+        Returns:
+            `BlockState`: The prepared batch of data.
+        """
+        from ..pipelines.modular_pipeline import BlockState
+
+        if input_fields is None:
+            raise ValueError("Input fields have not been set. Please call `set_input_fields` before preparing inputs.")
+        data_batch = {}
+        for key, value in input_fields.items():
+            try:
+                if isinstance(value, str):
+                    data_batch[key] = getattr(data, value)
+                elif isinstance(value, tuple):
+                    data_batch[key] = getattr(data, value[tuple_index])
+                else:
+                    # We've already checked that value is a string or a tuple of strings with length 2
+                    pass
+            except AttributeError:
+                raise ValueError(f"Expected `data` to have attribute(s) {value}, but it does not. Please check the input data.")
+        data_batch[cls._identifier_key] = identifier
+        return BlockState(**data_batch)
 
 
 def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
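
The `__call__` rewrite replaces the old `prepare_outputs`/`outputs` bookkeeping: each batch produced by `_prepare_batch` carries the `_identifier_key` attribute, the denoiser writes its `noise_pred` onto the batch, and `__call__` turns those identifiers into the keyword arguments of `forward`. A runnable sketch of just that routing, again with `SimpleNamespace` standing in for `BlockState` and strings in place of tensors (illustrative names, not diffusers APIs):

```
from types import SimpleNamespace

IDENTIFIER_KEY = "__guidance_identifier__"  # mirrors BaseGuidance._identifier_key

def call_sketch(data, num_conditions, forward):
    # Every prepared batch must have been filled in with a `noise_pred`.
    if not all(hasattr(d, "noise_pred") for d in data):
        raise ValueError("Expected all data to have `noise_pred` attribute.")
    if len(data) != num_conditions:
        raise ValueError(f"Expected {num_conditions} data items, but got {len(data)}.")
    # The identifier stored by _prepare_batch becomes the keyword name, so
    # forward(pred_cond=..., pred_uncond=...) receives the matching predictions.
    forward_inputs = {getattr(d, IDENTIFIER_KEY): d.noise_pred for d in data}
    return forward(**forward_inputs)

def fake_cfg_forward(pred_cond=None, pred_uncond=None):
    return f"combine({pred_cond}, {pred_uncond})", {}

batches = [
    SimpleNamespace(noise_pred="eps_cond", **{IDENTIFIER_KEY: "pred_cond"}),
    SimpleNamespace(noise_pred="eps_uncond", **{IDENTIFIER_KEY: "pred_uncond"}),
]
print(call_sketch(batches, 2, fake_cfg_forward))
# ('combine(eps_cond, eps_uncond)', {})
```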

src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_modular.py

Lines changed: 26 additions & 42 deletions
@@ -2239,64 +2239,48 @@ def __call__(self, pipeline, state: PipelineState) -> PipelineState:
         data.extra_step_kwargs = self.prepare_extra_step_kwargs(pipeline, data.generator, data.eta)
         data.num_warmup_steps = max(len(data.timesteps) - data.num_inference_steps * pipeline.scheduler.order, 0)
 
+        pipeline.guider.set_input_fields(
+            prompt_embeds=("prompt_embeds", "negative_prompt_embeds"),
+            add_time_ids=("add_time_ids", "negative_add_time_ids"),
+            pooled_prompt_embeds=("pooled_prompt_embeds", "negative_pooled_prompt_embeds"),
+            ip_adapter_embeds=("ip_adapter_embeds", "negative_ip_adapter_embeds"),
+        )
+
         with pipeline.progress_bar(total=data.num_inference_steps) as progress_bar:
             for i, t in enumerate(data.timesteps):
                 pipeline.guider.set_state(step=i, num_inference_steps=data.num_inference_steps, timestep=t)
+                guider_data = pipeline.guider.prepare_inputs(data)
 
-                (
-                    latents,
-                    prompt_embeds,
-                    add_time_ids,
-                    pooled_prompt_embeds,
-                    mask,
-                    masked_image_latents,
-                    ip_adapter_embeds,
-                ) = pipeline.guider.prepare_inputs(
-                    pipeline.unet,
-                    data.latents,
-                    (data.prompt_embeds, data.negative_prompt_embeds),
-                    (data.add_time_ids, data.negative_add_time_ids),
-                    (data.pooled_prompt_embeds, data.negative_pooled_prompt_embeds),
-                    data.mask,
-                    data.masked_image_latents,
-                    (data.ip_adapter_embeds, data.negative_ip_adapter_embeds),
-                )
-
-                for batch_index, (
-                    latents_i, prompt_embeds_i, add_time_ids_i, pooled_prompt_embeds_i, mask_i, masked_image_latents_i, ip_adapter_embeds_i,
-                ) in enumerate(zip(
-                    latents, prompt_embeds, add_time_ids, pooled_prompt_embeds, mask, masked_image_latents, ip_adapter_embeds
-                )):
+                data.scaled_latents = pipeline.scheduler.scale_model_input(data.latents, t)
+
+                # Prepare for inpainting
+                if data.num_channels_unet == 9:
+                    data.scaled_latents = torch.cat([data.scaled_latents, data.mask, data.masked_image_latents], dim=1)
+
+                for batch in guider_data:
                     pipeline.guider.prepare_models(pipeline.unet)
-                    latents_i = pipeline.scheduler.scale_model_input(latents_i, t)
-
-                    # Prepare for inpainting
-                    if data.num_channels_unet == 9:
-                        latents_i = torch.cat([latents_i, mask_i, masked_image_latents_i], dim=1)
 
                     # Prepare additional conditionings
-                    data.added_cond_kwargs = {
-                        "text_embeds": pooled_prompt_embeds_i,
-                        "time_ids": add_time_ids_i,
+                    batch.added_cond_kwargs = {
+                        "text_embeds": batch.pooled_prompt_embeds,
+                        "time_ids": batch.add_time_ids,
                     }
-                    if ip_adapter_embeds_i is not None:
-                        data.added_cond_kwargs["image_embeds"] = ip_adapter_embeds_i
-
+                    if batch.ip_adapter_embeds is not None:
+                        batch.added_cond_kwargs["image_embeds"] = batch.ip_adapter_embeds
+
                     # Predict the noise residual
-                    data.noise_pred = pipeline.unet(
-                        latents_i,
+                    batch.noise_pred = pipeline.unet(
+                        data.scaled_latents,
                         t,
-                        encoder_hidden_states=prompt_embeds_i,
+                        encoder_hidden_states=batch.prompt_embeds,
                         timestep_cond=data.timestep_cond,
                         cross_attention_kwargs=data.cross_attention_kwargs,
-                        added_cond_kwargs=data.added_cond_kwargs,
+                        added_cond_kwargs=batch.added_cond_kwargs,
                         return_dict=False,
                     )[0]
-                    data.noise_pred = pipeline.guider.prepare_outputs(pipeline.unet, data.noise_pred)
 
                 # Perform guidance
-                outputs, scheduler_step_kwargs = pipeline.guider.outputs
-                data.noise_pred = pipeline.guider(**outputs)
+                data.noise_pred, scheduler_step_kwargs = pipeline.guider(guider_data)
 
                 # Perform scheduler step using the predicted output
                 data.latents_dtype = data.latents.dtype