
Commit 016a6f1

Make T2I Adapters work with any resolution supported by the models (#7215)
## Summary

This change mimics the UNet padding strategy to align T2I-Adapter feature maps with the latents during denoising. It also slightly adjusts the crop and scale logic so that the control matches the input image without shifting when padding is needed.

## QA Instructions

Image generated at 1032x1024:

![image](https://github.com/user-attachments/assets/7ea579e4-61dc-4b6b-aa84-33d676d160c6)

Image generated at 1080x1040 to prove feature alignment:

![image](https://github.com/user-attachments/assets/ee6e5b6a-d0d5-474d-9fc4-f65c104964bd)

Edge artifacts on the bottom and right are a result of SDXL's UNet padding; T2I-Adapter influence is cut off in those regions.

## Merge Plan

Contingent on #7205. The Canvas UI currently prevents users from generating at resolutions that are not multiples of 64 while T2I-Adapter layers are active; this PR will stay a draft until that is fixed.

## Checklist

- [x] _The PR has a short but descriptive title, suitable for a changelog_
- [ ] _Tests added / updated (if applicable)_
- [ ] _Documentation added / updated (if applicable)_
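As a rough illustration of the alignment problem described in the summary above: when a latent dimension is odd at some UNet level, the UNet pads before downscaling (effectively ceil division), while a naively sized T2I-Adapter feature map follows floor division, so the residuals stop lining up. The sketch below is not part of the PR; the scale factor, level count, and helper names are assumptions for the example.

```python
# Illustrative sketch only (not code from this PR). Assumptions: a latent scale
# factor of 8 and a UNet that effectively ceil-divides the spatial size by 2 at
# each downsampling level because it pads odd-sized inputs first.
import math

LATENT_SCALE_FACTOR = 8  # assumed pixels per latent cell


def padded_unet_sizes(latent_size: int, levels: int = 3) -> list[int]:
    # Sizes the UNet actually produces: pad, then halve (ceil division).
    sizes = [latent_size]
    for _ in range(levels):
        sizes.append(math.ceil(sizes[-1] / 2))
    return sizes


def floor_sizes(latent_size: int, levels: int = 3) -> list[int]:
    # Sizes an unpadded feature map would have if it were simply halved (floor division).
    sizes = [latent_size]
    for _ in range(levels):
        sizes.append(sizes[-1] // 2)
    return sizes


# A 1032 px tall image gives a 129-row latent (1032 // 8).
print(padded_unet_sizes(129))  # [129, 65, 33, 17]
print(floor_sizes(129))        # [129, 64, 32, 16]
# The adapter residuals must be zero-padded up to the 65/33/17 sizes (and the control
# image cropped so content stays aligned), which is what this change does.
```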
2 parents 26f95d6 + 6fbc019 commit 016a6f1

File tree: 3 files changed, +35 −49 lines


invokeai/app/invocations/denoise_latents.py

Lines changed: 19 additions & 9 deletions
```diff
@@ -622,7 +622,7 @@ def run_t2i_adapters(
         for t2i_adapter_field in t2i_adapter:
             t2i_adapter_model_config = context.models.get_config(t2i_adapter_field.t2i_adapter_model.key)
             t2i_adapter_loaded_model = context.models.load(t2i_adapter_field.t2i_adapter_model)
-            image = context.images.get_pil(t2i_adapter_field.image.image_name)
+            image = context.images.get_pil(t2i_adapter_field.image.image_name, mode="RGB")
 
             # The max_unet_downscale is the maximum amount that the UNet model downscales the latent image internally.
             if t2i_adapter_model_config.base == BaseModelType.StableDiffusion1:
@@ -640,29 +640,39 @@ def run_t2i_adapters(
             with t2i_adapter_loaded_model as t2i_adapter_model:
                 total_downscale_factor = t2i_adapter_model.total_downscale_factor
 
-                # Resize the T2I-Adapter input image.
-                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
-                # result will match the latent image's dimensions after max_unet_downscale is applied.
-                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
-                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
-
                 # Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare
                 # a single image. If CFG is enabled, we will duplicate the resultant tensor after applying the
                 # T2I-Adapter model.
                 #
                 # Note: We re-use the `prepare_control_image(...)` from ControlNet for T2I-Adapter, because it has many
                 # of the same requirements (e.g. preserving binary masks during resize).
+
+                # Assuming fixed dimensional scaling of LATENT_SCALE_FACTOR.
+                _, _, latent_height, latent_width = latents_shape
+                control_height_resize = latent_height * LATENT_SCALE_FACTOR
+                control_width_resize = latent_width * LATENT_SCALE_FACTOR
                 t2i_image = prepare_control_image(
                     image=image,
                     do_classifier_free_guidance=False,
-                    width=t2i_input_width,
-                    height=t2i_input_height,
+                    width=control_width_resize,
+                    height=control_height_resize,
                     num_channels=t2i_adapter_model.config["in_channels"],  # mypy treats this as a FrozenDict
                     device=t2i_adapter_model.device,
                     dtype=t2i_adapter_model.dtype,
                     resize_mode=t2i_adapter_field.resize_mode,
                 )
 
+                # Resize the T2I-Adapter input image.
+                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
+                # result will match the latent image's dimensions after max_unet_downscale is applied.
+                # We crop the image to this size so that the positions match the input image on non-standard resolutions.
+                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
+                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
+                if t2i_image.shape[2] > t2i_input_height or t2i_image.shape[3] > t2i_input_width:
+                    t2i_image = t2i_image[
+                        :, :, : min(t2i_image.shape[2], t2i_input_height), : min(t2i_image.shape[3], t2i_input_width)
+                    ]
+
                 adapter_state = t2i_adapter_model(t2i_image)
 
                 if do_classifier_free_guidance:
```
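To make the new resize-then-crop behaviour concrete, here is a small standalone sketch (not the invocation code). The constants are assumptions for the example only; in the real code `LATENT_SCALE_FACTOR` is an app constant and `max_unet_downscale` / `total_downscale_factor` come from the model config and the loaded adapter.

```python
# Standalone sketch of the resize-then-crop logic above; the downscale constants
# are assumed example values, not ones read from a real model config.
LATENT_SCALE_FACTOR = 8
max_unet_downscale = 8        # assumed UNet downscale for this example
total_downscale_factor = 64   # assumed adapter downscale for this example

# A 1080x1040 px image (width x height) gives a 130x135 latent (height x width).
latent_height, latent_width = 130, 135

# Step 1: resize the control image to the full pixel size implied by the latent,
# so the control lines up with the input image instead of being squashed.
control_height = latent_height * LATENT_SCALE_FACTOR  # 1040
control_width = latent_width * LATENT_SCALE_FACTOR    # 1080

# Step 2: compute the largest adapter-compatible size and crop down to it, keeping
# the top-left corner so positions still match the input image.
t2i_input_height = latent_height // max_unet_downscale * total_downscale_factor  # 1024
t2i_input_width = latent_width // max_unet_downscale * total_downscale_factor    # 1024

crop_height = min(control_height, t2i_input_height)  # 1024
crop_width = min(control_width, t2i_input_width)     # 1024
```

Any control influence in the cropped-off band on the right/bottom is lost, which matches the edge artifacts noted in the QA section.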

invokeai/backend/stable_diffusion/diffusers_pipeline.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -499,6 +499,22 @@ def step(
                     for idx, value in enumerate(single_t2i_adapter_data.adapter_state):
                         accum_adapter_state[idx] += value * t2i_adapter_weight
 
+            # Hack: force compatibility with irregular resolutions by padding the feature map with zeros
+            for idx, tensor in enumerate(accum_adapter_state):
+                # The tensor size is supposed to be some integer downscale factor of the latents size.
+                # Internally, the unet will pad the latents before downscaling between levels when it is no longer divisible by its downscale factor.
+                # If the latent size does not scale down evenly, we need to pad the tensor so that it matches the downscaled padded latents later on.
+                scale_factor = latents.size()[-1] // tensor.size()[-1]
+                required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]
+                required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]
+                tensor = torch.nn.functional.pad(
+                    tensor,
+                    (0, required_padding_width, 0, required_padding_height, 0, 0, 0, 0),
+                    mode="constant",
+                    value=0,
+                )
+                accum_adapter_state[idx] = tensor
+
             down_intrablock_additional_residuals = accum_adapter_state
 
         # Handle inpainting models.
```
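For reference, a minimal, self-contained version of the padding step with concrete shapes (assumed for the example; the real tensors come from the accumulated adapter state). A 4-element pad tuple is used here because only the last two dimensions are padded, which is equivalent to the 8-element all-zero tuple used in the diff above.

```python
import math
import torch

# Assumed example shapes: a 1032x1024 px image gives a 129x128 latent; one adapter
# residual sits at half the latent resolution, where floor division gave 64x64.
latents = torch.zeros(1, 4, 129, 128)
feature = torch.zeros(1, 320, 64, 64)

scale_factor = latents.size(-1) // feature.size(-1)                        # 2
pad_width = math.ceil(latents.size(-1) / scale_factor) - feature.size(-1)  # 64 - 64 = 0
pad_height = math.ceil(latents.size(-2) / scale_factor) - feature.size(-2) # 65 - 64 = 1

# Zero-pad the right and bottom edges so the residual matches the size the UNet's
# padded downsampling will actually produce at this level.
padded = torch.nn.functional.pad(feature, (0, pad_width, 0, pad_height), mode="constant", value=0)
print(padded.shape)  # torch.Size([1, 320, 65, 64])
```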

invokeai/frontend/web/src/common/hooks/useIsReadyToEnqueue.ts

Lines changed: 0 additions & 40 deletions
```diff
@@ -202,46 +202,6 @@ const createSelector = (
         if (controlLayer.controlAdapter.model?.base !== model?.base) {
           problems.push(i18n.t('parameters.invoke.layer.controlAdapterIncompatibleBaseModel'));
         }
-        // T2I Adapters require images have dimensions that are multiples of 64 (SD1.5) or 32 (SDXL)
-        if (controlLayer.controlAdapter.type === 't2i_adapter') {
-          const multiple = model?.base === 'sdxl' ? 32 : 64;
-          if (bbox.scaleMethod === 'none') {
-            if (bbox.rect.width % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxWidth', {
-                  multiple,
-                  width: bbox.rect.width,
-                }),
-              });
-            }
-            if (bbox.rect.height % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxHeight', {
-                  multiple,
-                  height: bbox.rect.height,
-                }),
-              });
-            }
-          } else {
-            if (bbox.scaledSize.width % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxWidth', {
-                  multiple,
-                  width: bbox.scaledSize.width,
-                }),
-              });
-            }
-            if (bbox.scaledSize.height % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxHeight', {
-                  multiple,
-                  height: bbox.scaledSize.height,
-                }),
-              });
-            }
-          }
-        }
-
         if (problems.length) {
           const content = upperFirst(problems.join(', '));
           reasons.push({ prefix, content });
```