|  | 
# Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|  | 14 | + | 
|  | 15 | + | 
|  | 16 | +import torch | 
|  | 17 | +from transformers import SiglipImageProcessor, SiglipVisionModel | 
|  | 18 | + | 
|  | 19 | +from ...image_processor import PipelineImageInput | 
|  | 20 | +from ...utils import ( | 
|  | 21 | +    is_torch_xla_available, | 
|  | 22 | +    logging, | 
|  | 23 | +    replace_example_docstring, | 
|  | 24 | +) | 
|  | 25 | +from ..pipeline_utils import DiffusionPipeline | 
|  | 26 | +from .modeling_flux import ReduxImageEncoder | 
|  | 27 | +from .pipeline_output import FluxPriorReduxPipelineOutput | 
|  | 28 | + | 
|  | 29 | + | 
# Torch XLA (TPU) availability flag; `is_torch_xla_available()` already yields a
# truth value, so a plain bool() coercion replaces the if/else assignment.
XLA_AVAILABLE = bool(is_torch_xla_available())


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
|  | 37 | + | 
# Usage example injected into `__call__`'s docstring via `replace_example_docstring`.
# It demonstrates THIS pipeline (image -> Flux image embeddings), not the
# text-to-image FluxPipeline the example previously showed.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import FluxPriorReduxPipeline
        >>> from diffusers.utils import load_image

        >>> pipe_prior_redux = FluxPriorReduxPipeline.from_pretrained(
        ...     "black-forest-labs/FLUX.1-Redux-dev", torch_dtype=torch.bfloat16
        ... )
        >>> pipe_prior_redux.to("cuda")

        >>> image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png"
        ... )
        >>> prior_output = pipe_prior_redux(image)
        >>> image_embeds = prior_output.image_embeds
        ```
"""
|  | 53 | + | 
|  | 54 | + | 
class FluxPriorReduxPipeline(DiffusionPipeline):
    r"""
    The Flux Redux prior pipeline: encodes a conditioning image into Flux image embeddings.

    The input image is encoded with a SigLIP vision backbone, and the resulting hidden states are
    projected by a Redux image embedder into embeddings that a Flux generation pipeline can consume.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Args:
        image_encoder ([`~transformers.SiglipVisionModel`]):
            SigLIP vision model used to encode the conditioning image.
        feature_extractor ([`~transformers.SiglipImageProcessor`]):
            Image processor that resizes/normalizes the input image for `image_encoder`.
        image_embedder ([`ReduxImageEncoder`]):
            Projection module mapping SigLIP hidden states to Flux image embeddings.
    """

    # Offload order mirrors the forward pass: vision encoder first, then the embedder.
    model_cpu_offload_seq = "image_encoder->image_embedder"
    _optional_components = []
    _callback_tensor_inputs = []

    def __init__(
        self,
        image_encoder: SiglipVisionModel,
        feature_extractor: SiglipImageProcessor,
        image_embedder: ReduxImageEncoder,
    ):
        super().__init__()

        self.register_modules(
            image_encoder=image_encoder,
            feature_extractor=feature_extractor,
            image_embedder=image_embedder,
        )

    def encode_image(self, image, device, num_images_per_prompt):
        """Encode a single conditioning image into SigLIP hidden states.

        Args:
            image: A single image in any format the feature extractor accepts.
            device: Device to run the vision encoder on.
            num_images_per_prompt (`int`):
                Number of copies of the hidden states to return (repeated along the batch dim).

        Returns:
            `torch.Tensor`: The encoder's `last_hidden_state`, repeated `num_images_per_prompt`
            times along dim 0.
        """
        dtype = next(self.image_encoder.parameters()).dtype
        # NOTE(review): `images=[image]` assumes a single image; passing a list here would be
        # double-wrapped — confirm against callers.
        image = self.feature_extractor.preprocess(
            images=[image], do_resize=True, return_tensors="pt", do_convert_rgb=True
        )
        image = image.to(device=device, dtype=dtype)
        image_enc_hidden_states = self.image_encoder(**image).last_hidden_state
        image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)

        return image_enc_hidden_states

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image: PipelineImageInput,
        return_dict: bool = True,
    ):
        r"""
        Function invoked when calling the pipeline.

        Args:
            image (`PipelineImageInput`):
                The conditioning image to encode into Flux image embeddings.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.flux.FluxPriorReduxPipelineOutput`]
                instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.flux.FluxPriorReduxPipelineOutput`] or `tuple`:
            [`~pipelines.flux.FluxPriorReduxPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple` whose first element is the image embeddings.
        """
        device = self._execution_device

        # Encode the image with SigLIP, then project to Flux image embeddings.
        image_latents = self.encode_image(image, device, 1)
        image_embeds = self.image_embedder(image_latents).image_embeds

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image_embeds,)

        return FluxPriorReduxPipelineOutput(image_embeds=image_embeds)