|  | 
|  | 1 | +# Copyright 2025 The HuggingFace Team. All rights reserved. | 
|  | 2 | +# | 
|  | 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); | 
|  | 4 | +# you may not use this file except in compliance with the License. | 
|  | 5 | +# You may obtain a copy of the License at | 
|  | 6 | +# | 
|  | 7 | +#     http://www.apache.org/licenses/LICENSE-2.0 | 
|  | 8 | +# | 
|  | 9 | +# Unless required by applicable law or agreed to in writing, software | 
|  | 10 | +# distributed under the License is distributed on an "AS IS" BASIS, | 
|  | 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
|  | 12 | +# See the License for the specific language governing permissions and | 
|  | 13 | +# limitations under the License. | 
|  | 14 | + | 
|  | 15 | +from typing import List, Optional, Union | 
|  | 16 | + | 
|  | 17 | +from ..hooks import LayerSkipConfig | 
|  | 18 | +from .skip_layer_guidance import SkipLayerGuidance | 
|  | 19 | + | 
|  | 20 | + | 
class PerturbedAttentionGuidance(SkipLayerGuidance):
    """
    Perturbed Attention Guidance (PAG): https://huggingface.co/papers/2403.17377

    The intuition behind PAG can be thought of as moving the CFG predicted distribution estimates further away from
    worse versions of the conditional distribution estimates. PAG was one of the first techniques to introduce the idea
    of using a worse version of the trained model for better guiding itself in the denoising process. It perturbs the
    attention scores of the latent stream by replacing the score matrix with an identity matrix for selectively chosen
    layers.

    Additional reading:
    - [Guiding a Diffusion Model with a Bad Version of Itself](https://huggingface.co/papers/2406.02507)

    PAG is implemented as a specialization of the SkipLayerGuidance due to similarities in the configuration parameters
    and implementation details.

    Args:
        guidance_scale (`float`, defaults to `7.5`):
            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
            deterioration of image quality.
        perturbed_guidance_scale (`float`, defaults to `2.8`):
            The scale parameter for perturbed attention guidance.
        perturbed_guidance_start (`float`, defaults to `0.01`):
            The fraction of the total number of denoising steps after which perturbed attention guidance starts.
        perturbed_guidance_stop (`float`, defaults to `0.2`):
            The fraction of the total number of denoising steps after which perturbed attention guidance stops.
        perturbed_guidance_layers (`int` or `List[int]`, *optional*):
            The layer indices to apply perturbed attention guidance to. Can be a single integer or a list of integers.
            If not provided, `skip_layer_config` must be provided.
        skip_layer_config (`LayerSkipConfig` or `List[LayerSkipConfig]`, *optional*):
            The configuration for the perturbed attention guidance. Can be a single `LayerSkipConfig` or a list of
            `LayerSkipConfig`. If not provided, `perturbed_guidance_layers` must be provided.
        guidance_rescale (`float`, defaults to `0.0`):
            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://huggingface.co/papers/2305.08891).
        use_original_formulation (`bool`, defaults to `False`):
            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
            we use the diffusers-native implementation that has been in the codebase for a long time. See
            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
        start (`float`, defaults to `0.0`):
            The fraction of the total number of denoising steps after which guidance starts.
        stop (`float`, defaults to `1.0`):
            The fraction of the total number of denoising steps after which guidance stops.

    Raises:
        ValueError:
            If neither `perturbed_guidance_layers` nor `skip_layer_config` is provided, or if both are provided.
    """

    # NOTE: The current implementation does not account for joint latent conditioning (text + image/video tokens in
    # the same latent stream). It assumes the entire latent is a single stream of visual tokens. It would be very
    # complex to support joint latent conditioning in a model-agnostic manner without specializing the implementation
    # for each model architecture.

    def __init__(
        self,
        guidance_scale: float = 7.5,
        perturbed_guidance_scale: float = 2.8,
        perturbed_guidance_start: float = 0.01,
        perturbed_guidance_stop: float = 0.2,
        perturbed_guidance_layers: Optional[Union[int, List[int]]] = None,
        skip_layer_config: Optional[Union[LayerSkipConfig, List[LayerSkipConfig]]] = None,
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
        start: float = 0.0,
        stop: float = 1.0,
    ):
        if skip_layer_config is None:
            if perturbed_guidance_layers is None:
                raise ValueError(
                    "`perturbed_guidance_layers` must be provided if `skip_layer_config` is not specified."
                )
            # The docstring allows a single layer index; wrap it so `indices` always receives a list.
            if isinstance(perturbed_guidance_layers, int):
                perturbed_guidance_layers = [perturbed_guidance_layers]
            # PAG perturbs only the attention scores; the attention block and feed-forward are left in place.
            skip_layer_config = LayerSkipConfig(
                indices=perturbed_guidance_layers,
                skip_attention=False,
                skip_attention_scores=True,
                skip_ff=False,
            )
        elif perturbed_guidance_layers is not None:
            raise ValueError(
                "`perturbed_guidance_layers` should not be provided if `skip_layer_config` is specified."
            )

        # At this point `skip_layer_config` fully describes the perturbation, so only the config is forwarded.
        # Passing the raw layer indices as well would hand the parent two competing specifications.
        # NOTE(review): the parent constructor is not visible here; it is assumed to accept a config on its own
        # and to treat layers + config as mutually exclusive, as this class does - confirm against
        # `SkipLayerGuidance.__init__`.
        super().__init__(
            guidance_scale=guidance_scale,
            skip_layer_guidance_scale=perturbed_guidance_scale,
            skip_layer_guidance_start=perturbed_guidance_start,
            skip_layer_guidance_stop=perturbed_guidance_stop,
            skip_layer_guidance_layers=None,
            skip_layer_config=skip_layer_config,
            guidance_rescale=guidance_rescale,
            use_original_formulation=use_original_formulation,
            start=start,
            stop=stop,
        )
0 commit comments