invoke-ai
diff --git a/‎docker/Dockerfile
Lines changed: 1 addition & 0 deletions b/‎docker/Dockerfile
Lines changed: 1 addition & 0 deletions
diff --git a/‎invokeai/app/invocations/denoise_latents.py
Lines changed: 74 additions & 23 deletions b/‎invokeai/app/invocations/denoise_latents.py
Lines changed: 74 additions & 23 deletions
diff --git a/‎invokeai/app/invocations/latents_to_image.py
Lines changed: 2 additions & 2 deletions b/‎invokeai/app/invocations/latents_to_image.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎invokeai/backend/stable_diffusion/__init__.py
Lines changed: 0 additions & 2 deletions b/‎invokeai/backend/stable_diffusion/__init__.py
Lines changed: 0 additions & 2 deletions
diff --git a/‎invokeai/backend/stable_diffusion/extensions/inpaint.py
Lines changed: 120 additions & 0 deletions b/‎invokeai/backend/stable_diffusion/extensions/inpaint.py
Lines changed: 120 additions & 0 deletions
@@ -55,6 +55,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 FROM node:20-slim AS web-builder
 ENV PNPM_HOME="/pnpm"
 ENV PATH="$PNPM_HOME:$PATH"
+RUN corepack use [email protected]
 RUN corepack enable
 
 WORKDIR /build
 
@@ -37,9 +37,9 @@
 from invokeai.app.util.controlnet_utils import prepare_control_image
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.lora import LoRAModelRaw
-from invokeai.backend.model_manager import BaseModelType
+from invokeai.backend.model_manager import BaseModelType, ModelVariantType
 from invokeai.backend.model_patcher import ModelPatcher
-from invokeai.backend.stable_diffusion import PipelineIntermediateState, set_seamless
+from invokeai.backend.stable_diffusion import PipelineIntermediateState
 from invokeai.backend.stable_diffusion.denoise_context import DenoiseContext, DenoiseInputs
 from invokeai.backend.stable_diffusion.diffusers_pipeline import (
     ControlNetData,
@@ -60,8 +60,12 @@
 from invokeai.backend.stable_diffusion.extension_callback_type import ExtensionCallbackType
 from invokeai.backend.stable_diffusion.extensions.controlnet import ControlNetExt
 from invokeai.backend.stable_diffusion.extensions.freeu import FreeUExt
+from invokeai.backend.stable_diffusion.extensions.inpaint import InpaintExt
+from invokeai.backend.stable_diffusion.extensions.inpaint_model import InpaintModelExt
 from invokeai.backend.stable_diffusion.extensions.preview import PreviewExt
 from invokeai.backend.stable_diffusion.extensions.rescale_cfg import RescaleCFGExt
+from invokeai.backend.stable_diffusion.extensions.seamless import SeamlessExt
+from invokeai.backend.stable_diffusion.extensions.t2i_adapter import T2IAdapterExt
 from invokeai.backend.stable_diffusion.extensions_manager import ExtensionsManager
 from invokeai.backend.stable_diffusion.schedulers import SCHEDULER_MAP
 from invokeai.backend.stable_diffusion.schedulers.schedulers import SCHEDULER_NAME_VALUES
@@ -498,6 +502,33 @@ def parse_controlnet_field(
                 )
             )
 
+    @staticmethod
+    def parse_t2i_adapter_field(
+        exit_stack: ExitStack,
+        context: InvocationContext,
+        t2i_adapters: Optional[Union[T2IAdapterField, list[T2IAdapterField]]],
+        ext_manager: ExtensionsManager,
+    ) -> None:
+        if t2i_adapters is None:
+            return
+
+        # Handle the possibility that t2i_adapters could be a list or a single T2IAdapterField.
+        if isinstance(t2i_adapters, T2IAdapterField):
+            t2i_adapters = [t2i_adapters]
+
+        for t2i_adapter_field in t2i_adapters:
+            ext_manager.add_extension(
+                T2IAdapterExt(
+                    node_context=context,
+                    model_id=t2i_adapter_field.t2i_adapter_model,
+                    image=context.images.get_pil(t2i_adapter_field.image.image_name),
+                    weight=t2i_adapter_field.weight,
+                    begin_step_percent=t2i_adapter_field.begin_step_percent,
+                    end_step_percent=t2i_adapter_field.end_step_percent,
+                    resize_mode=t2i_adapter_field.resize_mode,
+                )
+            )
+
     def prep_ip_adapter_image_prompts(
         self,
         context: InvocationContext,
@@ -707,7 +738,7 @@ def prep_inpaint_mask(
         else:
             masked_latents = torch.where(mask < 0.5, 0.0, latents)
 
-        return 1 - mask, masked_latents, self.denoise_mask.gradient
+        return mask, masked_latents, self.denoise_mask.gradient
 
     @staticmethod
     def prepare_noise_and_latents(
@@ -765,10 +796,6 @@ def _new_invoke(self, context: InvocationContext) -> LatentsOutput:
         dtype = TorchDevice.choose_torch_dtype()
 
         seed, noise, latents = self.prepare_noise_and_latents(context, self.noise, self.latents)
-        latents = latents.to(device=device, dtype=dtype)
-        if noise is not None:
-            noise = noise.to(device=device, dtype=dtype)
-
         _, _, latent_height, latent_width = latents.shape
 
         conditioning_data = self.get_conditioning_data(
@@ -801,21 +828,6 @@ def _new_invoke(self, context: InvocationContext) -> LatentsOutput:
             denoising_end=self.denoising_end,
         )
 
-        denoise_ctx = DenoiseContext(
-            inputs=DenoiseInputs(
-                orig_latents=latents,
-                timesteps=timesteps,
-                init_timestep=init_timestep,
-                noise=noise,
-                seed=seed,
-                scheduler_step_kwargs=scheduler_step_kwargs,
-                conditioning_data=conditioning_data,
-                attention_processor_cls=CustomAttnProcessor2_0,
-            ),
-            unet=None,
-            scheduler=scheduler,
-        )
-
         # get the unet's config so that we can pass the base to sd_step_callback()
         unet_config = context.models.get_config(self.unet.unet.key)
 
@@ -833,13 +845,48 @@ def step_callback(state: PipelineIntermediateState) -> None:
         if self.unet.freeu_config:
             ext_manager.add_extension(FreeUExt(self.unet.freeu_config))
 
+        ### seamless
+        if self.unet.seamless_axes:
+            ext_manager.add_extension(SeamlessExt(self.unet.seamless_axes))
+
+        ### inpaint
+        mask, masked_latents, is_gradient_mask = self.prep_inpaint_mask(context, latents)
+        # NOTE: We used to identify inpainting models by inpecting the shape of the loaded UNet model weights. Now we
+        # use the ModelVariantType config. During testing, there was a report of a user with models that had an
+        # incorrect ModelVariantType value. Re-installing the model fixed the issue. If this issue turns out to be
+        # prevalent, we will have to revisit how we initialize the inpainting extensions.
+        if unet_config.variant == ModelVariantType.Inpaint:
+            ext_manager.add_extension(InpaintModelExt(mask, masked_latents, is_gradient_mask))
+        elif mask is not None:
+            ext_manager.add_extension(InpaintExt(mask, is_gradient_mask))
+
+        # Initialize context for modular denoise
+        latents = latents.to(device=device, dtype=dtype)
+        if noise is not None:
+            noise = noise.to(device=device, dtype=dtype)
+        denoise_ctx = DenoiseContext(
+            inputs=DenoiseInputs(
+                orig_latents=latents,
+                timesteps=timesteps,
+                init_timestep=init_timestep,
+                noise=noise,
+                seed=seed,
+                scheduler_step_kwargs=scheduler_step_kwargs,
+                conditioning_data=conditioning_data,
+                attention_processor_cls=CustomAttnProcessor2_0,
+            ),
+            unet=None,
+            scheduler=scheduler,
+        )
+
         # context for loading additional models
         with ExitStack() as exit_stack:
             # later should be smth like:
             # for extension_field in self.extensions:
             #    ext = extension_field.to_extension(exit_stack, context, ext_manager)
             #    ext_manager.add_extension(ext)
             self.parse_controlnet_field(exit_stack, context, self.control, ext_manager)
+            self.parse_t2i_adapter_field(exit_stack, context, self.t2i_adapter, ext_manager)
 
             # ext: t2i/ip adapter
             ext_manager.run_callback(ExtensionCallbackType.SETUP, denoise_ctx)
@@ -871,6 +918,10 @@ def _old_invoke(self, context: InvocationContext) -> LatentsOutput:
         seed, noise, latents = self.prepare_noise_and_latents(context, self.noise, self.latents)
 
         mask, masked_latents, gradient_mask = self.prep_inpaint_mask(context, latents)
+        # At this point, the mask ranges from 0 (leave unchanged) to 1 (inpaint).
+        # We invert the mask here for compatibility with the old backend implementation.
+        if mask is not None:
+            mask = 1 - mask
 
         # TODO(ryand): I have hard-coded `do_classifier_free_guidance=True` to mirror the behaviour of ControlNets,
         # below. Investigate whether this is appropriate.
@@ -915,7 +966,7 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
             ExitStack() as exit_stack,
             unet_info.model_on_device() as (model_state_dict, unet),
             ModelPatcher.apply_freeu(unet, self.unet.freeu_config),
-            set_seamless(unet, self.unet.seamless_axes),  # FIXME
+            SeamlessExt.static_patch_model(unet, self.unet.seamless_axes),  # FIXME
             # Apply the LoRA after unet has been moved to its target device for faster patching.
             ModelPatcher.apply_lora_unet(
                 unet,
 
@@ -24,7 +24,7 @@
 from invokeai.app.invocations.model import VAEField
 from invokeai.app.invocations.primitives import ImageOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
-from invokeai.backend.stable_diffusion import set_seamless
+from invokeai.backend.stable_diffusion.extensions.seamless import SeamlessExt
 from invokeai.backend.stable_diffusion.vae_tiling import patch_vae_tiling_params
 from invokeai.backend.util.devices import TorchDevice
 
@@ -59,7 +59,7 @@ def invoke(self, context: InvocationContext) -> ImageOutput:
 
         vae_info = context.models.load(self.vae.vae)
         assert isinstance(vae_info.model, (AutoencoderKL, AutoencoderTiny))
-        with set_seamless(vae_info.model, self.vae.seamless_axes), vae_info as vae:
+        with SeamlessExt.static_patch_model(vae_info.model, self.vae.seamless_axes), vae_info as vae:
             assert isinstance(vae, (AutoencoderKL, AutoencoderTiny))
             latents = latents.to(vae.device)
             if self.fp32:
 
@@ -7,11 +7,9 @@
     StableDiffusionGeneratorPipeline,
 )
 from invokeai.backend.stable_diffusion.diffusion import InvokeAIDiffuserComponent  # noqa: F401
-from invokeai.backend.stable_diffusion.seamless import set_seamless  # noqa: F401
 
 __all__ = [
     "PipelineIntermediateState",
     "StableDiffusionGeneratorPipeline",
     "InvokeAIDiffuserComponent",
-    "set_seamless",
 ]
@@ -0,0 +1,120 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+import einops
+import torch
+from diffusers import UNet2DConditionModel
+
+from invokeai.backend.stable_diffusion.extension_callback_type import ExtensionCallbackType
+from invokeai.backend.stable_diffusion.extensions.base import ExtensionBase, callback
+
+if TYPE_CHECKING:
+    from invokeai.backend.stable_diffusion.denoise_context import DenoiseContext
+
+
+class InpaintExt(ExtensionBase):
+    """An extension for inpainting with non-inpainting models. See `InpaintModelExt` for inpainting with inpainting
+    models.
+    """
+
+    def __init__(
+        self,
+        mask: torch.Tensor,
+        is_gradient_mask: bool,
+    ):
+        """Initialize InpaintExt.
+        Args:
+            mask (torch.Tensor): The inpainting mask. Shape: (1, 1, latent_height, latent_width). Values are
+                expected to be in the range [0, 1]. A value of 1 means that the corresponding 'pixel' should not be
+                inpainted.
+            is_gradient_mask (bool): If True, mask is interpreted as a gradient mask meaning that the mask values range
+                from 0 to 1. If False, mask is interpreted as binary mask meaning that the mask values are either 0 or
+                1.
+        """
+        super().__init__()
+        self._mask = mask
+        self._is_gradient_mask = is_gradient_mask
+
+        # Noise, which used to noisify unmasked part of image
+        # if noise provided to context, then it will be used
+        # if no noise provided, then noise will be generated based on seed
+        self._noise: Optional[torch.Tensor] = None
+
+    @staticmethod
+    def _is_normal_model(unet: UNet2DConditionModel):
+        """Checks if the provided UNet belongs to a regular model.
+        The `in_channels` of a UNet vary depending on model type:
+        - normal - 4
+        - depth - 5
+        - inpaint - 9
+        """
+        return unet.conv_in.in_channels == 4
+
+    def _apply_mask(self, ctx: DenoiseContext, latents: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+        batch_size = latents.size(0)
+        mask = einops.repeat(self._mask, "b c h w -> (repeat b) c h w", repeat=batch_size)
+        if t.dim() == 0:
+            # some schedulers expect t to be one-dimensional.
+            # TODO: file diffusers bug about inconsistency?
+            t = einops.repeat(t, "-> batch", batch=batch_size)
+        # Noise shouldn't be re-randomized between steps here. The multistep schedulers
+        # get very confused about what is happening from step to step when we do that.
+        mask_latents = ctx.scheduler.add_noise(ctx.inputs.orig_latents, self._noise, t)
+        # TODO: Do we need to also apply scheduler.scale_model_input? Or is add_noise appropriately scaled already?
+        # mask_latents = self.scheduler.scale_model_input(mask_latents, t)
+        mask_latents = einops.repeat(mask_latents, "b c h w -> (repeat b) c h w", repeat=batch_size)
+        if self._is_gradient_mask:
+            threshold = (t.item()) / ctx.scheduler.config.num_train_timesteps
+            mask_bool = mask < 1 - threshold
+            masked_input = torch.where(mask_bool, latents, mask_latents)
+        else:
+            masked_input = torch.lerp(latents, mask_latents.to(dtype=latents.dtype), mask.to(dtype=latents.dtype))
+        return masked_input
+
+    @callback(ExtensionCallbackType.PRE_DENOISE_LOOP)
+    def init_tensors(self, ctx: DenoiseContext):
+        if not self._is_normal_model(ctx.unet):
+            raise ValueError(
+                "InpaintExt should be used only on normal (non-inpainting) models. This could be caused by an "
+                "inpainting model that was incorrectly marked as a non-inpainting model. In some cases, this can be "
+                "fixed by removing and re-adding the model (so that it gets re-probed)."
+            )
+
+        self._mask = self._mask.to(device=ctx.latents.device, dtype=ctx.latents.dtype)
+
+        self._noise = ctx.inputs.noise
+        # 'noise' might be None if the latents have already been noised (e.g. when running the SDXL refiner).
+        # We still need noise for inpainting, so we generate it from the seed here.
+        if self._noise is None:
+            self._noise = torch.randn(
+                ctx.latents.shape,
+                dtype=torch.float32,
+                device="cpu",
+                generator=torch.Generator(device="cpu").manual_seed(ctx.seed),
+            ).to(device=ctx.latents.device, dtype=ctx.latents.dtype)
+
+    # Use negative order to make extensions with default order work with patched latents
+    @callback(ExtensionCallbackType.PRE_STEP, order=-100)
+    def apply_mask_to_initial_latents(self, ctx: DenoiseContext):
+        ctx.latents = self._apply_mask(ctx, ctx.latents, ctx.timestep)
+
+    # TODO: redo this with preview events rewrite
+    # Use negative order to make extensions with default order work with patched latents
+    @callback(ExtensionCallbackType.POST_STEP, order=-100)
+    def apply_mask_to_step_output(self, ctx: DenoiseContext):
+        timestep = ctx.scheduler.timesteps[-1]
+        if hasattr(ctx.step_output, "denoised"):
+            ctx.step_output.denoised = self._apply_mask(ctx, ctx.step_output.denoised, timestep)
+        elif hasattr(ctx.step_output, "pred_original_sample"):
+            ctx.step_output.pred_original_sample = self._apply_mask(ctx, ctx.step_output.pred_original_sample, timestep)
+        else:
+            ctx.step_output.pred_original_sample = self._apply_mask(ctx, ctx.step_output.prev_sample, timestep)
+
+    # Restore unmasked part after the last step is completed
+    @callback(ExtensionCallbackType.POST_DENOISE_LOOP)
+    def restore_unmasked(self, ctx: DenoiseContext):
+        if self._is_gradient_mask:
+            ctx.latents = torch.where(self._mask < 1, ctx.latents, ctx.inputs.orig_latents)
+        else:
+            ctx.latents = torch.lerp(ctx.latents, ctx.inputs.orig_latents, self._mask)
Original file line number	Diff line number	Diff line change
`@@ -7,11 +7,9 @@`
`7`	`7`	`StableDiffusionGeneratorPipeline,`
`8`	`8`	`)`
`9`	`9`	`from invokeai.backend.stable_diffusion.diffusion import InvokeAIDiffuserComponent # noqa: F401`
`10`		`-from invokeai.backend.stable_diffusion.seamless import set_seamless # noqa: F401`
`11`	`10`
`12`	`11`	`__all__ = [`
`13`	`12`	`"PipelineIntermediateState",`
`14`	`13`	`"StableDiffusionGeneratorPipeline",`
`15`	`14`	`"InvokeAIDiffuserComponent",`
`16`		`- "set_seamless",`
`17`	`15`	`]`