
Commit 016a6f1

Make T2I Adapters work with any resolution supported by the models (#7215)
## Summary

This change mimics the UNet padding strategy to align T2I-Adapter feature maps with the latents during denoising. It also slightly adjusts the crop and scale logic so that the control matches the input image without shifting when padding is needed.

## QA Instructions

Image generated at 1032x1024:

![image](https://github.com/user-attachments/assets/7ea579e4-61dc-4b6b-aa84-33d676d160c6)

Image generated at 1080x1040 to prove feature alignment:

![image](https://github.com/user-attachments/assets/ee6e5b6a-d0d5-474d-9fc4-f65c104964bd)

Edge artifacts on the bottom and right are a result of SDXL's UNet padding; T2I-Adapter influence is cut off in those regions.

## Merge Plan

Contingent on #7205. The Canvas UI currently prevents users from generating at resolutions that are not multiples of 64 while T2I-Adapter layers are active; this PR will stay a draft until that is fixed.

## Checklist

- [x] _The PR has a short but descriptive title, suitable for a changelog_
- [ ] _Tests added / updated (if applicable)_
- [ ] _Documentation added / updated (if applicable)_
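As a rough illustration of the alignment problem described in the summary above: when a latent dimension is odd at some UNet level, the UNet pads before downscaling (effectively ceil division), while a naively sized T2I-Adapter feature map follows floor division, so the residuals stop lining up. The sketch below is not part of the PR; the scale factor, level count, and helper names are assumptions for the example.

```python
# Illustrative sketch only (not code from this PR). Assumptions: a latent scale
# factor of 8 and a UNet that effectively ceil-divides the spatial size by 2 at
# each downsampling level because it pads odd-sized inputs first.
import math

LATENT_SCALE_FACTOR = 8  # assumed pixels per latent cell


def padded_unet_sizes(latent_size: int, levels: int = 3) -> list[int]:
    # Sizes the UNet actually produces: pad, then halve (ceil division).
    sizes = [latent_size]
    for _ in range(levels):
        sizes.append(math.ceil(sizes[-1] / 2))
    return sizes


def floor_sizes(latent_size: int, levels: int = 3) -> list[int]:
    # Sizes an unpadded feature map would have if it were simply halved (floor division).
    sizes = [latent_size]
    for _ in range(levels):
        sizes.append(sizes[-1] // 2)
    return sizes


# A 1032 px tall image gives a 129-row latent (1032 // 8).
print(padded_unet_sizes(129))  # [129, 65, 33, 17]
print(floor_sizes(129))        # [129, 64, 32, 16]
# The adapter residuals must be zero-padded up to the 65/33/17 sizes (and the control
# image cropped so content stays aligned), which is what this change does.
```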
2 parents 26f95d6 + 6fbc019 commit 016a6f1

File tree: 3 files changed, +35 −49 lines


invokeai/app/invocations/denoise_latents.py

Lines changed: 19 additions & 9 deletions
```diff
@@ -622,7 +622,7 @@ def run_t2i_adapters(
         for t2i_adapter_field in t2i_adapter:
             t2i_adapter_model_config = context.models.get_config(t2i_adapter_field.t2i_adapter_model.key)
             t2i_adapter_loaded_model = context.models.load(t2i_adapter_field.t2i_adapter_model)
-            image = context.images.get_pil(t2i_adapter_field.image.image_name)
+            image = context.images.get_pil(t2i_adapter_field.image.image_name, mode="RGB")
 
             # The max_unet_downscale is the maximum amount that the UNet model downscales the latent image internally.
             if t2i_adapter_model_config.base == BaseModelType.StableDiffusion1:
@@ -640,29 +640,39 @@ def run_t2i_adapters(
             with t2i_adapter_loaded_model as t2i_adapter_model:
                 total_downscale_factor = t2i_adapter_model.total_downscale_factor
 
-                # Resize the T2I-Adapter input image.
-                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
-                # result will match the latent image's dimensions after max_unet_downscale is applied.
-                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
-                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
-
                 # Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare
                 # a single image. If CFG is enabled, we will duplicate the resultant tensor after applying the
                 # T2I-Adapter model.
                 #
                 # Note: We re-use the `prepare_control_image(...)` from ControlNet for T2I-Adapter, because it has many
                 # of the same requirements (e.g. preserving binary masks during resize).
+
+                # Assuming fixed dimensional scaling of LATENT_SCALE_FACTOR.
+                _, _, latent_height, latent_width = latents_shape
+                control_height_resize = latent_height * LATENT_SCALE_FACTOR
+                control_width_resize = latent_width * LATENT_SCALE_FACTOR
                 t2i_image = prepare_control_image(
                     image=image,
                     do_classifier_free_guidance=False,
-                    width=t2i_input_width,
-                    height=t2i_input_height,
+                    width=control_width_resize,
+                    height=control_height_resize,
                     num_channels=t2i_adapter_model.config["in_channels"],  # mypy treats this as a FrozenDict
                     device=t2i_adapter_model.device,
                     dtype=t2i_adapter_model.dtype,
                     resize_mode=t2i_adapter_field.resize_mode,
                 )
 
+                # Resize the T2I-Adapter input image.
+                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
+                # result will match the latent image's dimensions after max_unet_downscale is applied.
+                # We crop the image to this size so that the positions match the input image on non-standard resolutions.
+                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
+                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
+                if t2i_image.shape[2] > t2i_input_height or t2i_image.shape[3] > t2i_input_width:
+                    t2i_image = t2i_image[
+                        :, :, : min(t2i_image.shape[2], t2i_input_height), : min(t2i_image.shape[3], t2i_input_width)
+                    ]
+
                 adapter_state = t2i_adapter_model(t2i_image)
 
                 if do_classifier_free_guidance:
```
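To make the new resize-then-crop behaviour concrete, here is a small standalone sketch (not the invocation code). The constants are assumptions for the example only; in the real code `LATENT_SCALE_FACTOR` is an app constant and `max_unet_downscale` / `total_downscale_factor` come from the model config and the loaded adapter.

```python
# Standalone sketch of the resize-then-crop logic above; the downscale constants
# are assumed example values, not ones read from a real model config.
LATENT_SCALE_FACTOR = 8
max_unet_downscale = 8        # assumed UNet downscale for this example
total_downscale_factor = 64   # assumed adapter downscale for this example

# A 1080x1040 px image (width x height) gives a 130x135 latent (height x width).
latent_height, latent_width = 130, 135

# Step 1: resize the control image to the full pixel size implied by the latent,
# so the control lines up with the input image instead of being squashed.
control_height = latent_height * LATENT_SCALE_FACTOR  # 1040
control_width = latent_width * LATENT_SCALE_FACTOR    # 1080

# Step 2: compute the largest adapter-compatible size and crop down to it, keeping
# the top-left corner so positions still match the input image.
t2i_input_height = latent_height // max_unet_downscale * total_downscale_factor  # 1024
t2i_input_width = latent_width // max_unet_downscale * total_downscale_factor    # 1024

crop_height = min(control_height, t2i_input_height)  # 1024
crop_width = min(control_width, t2i_input_width)     # 1024
```

Any control influence in the cropped-off band on the right/bottom is lost, which matches the edge artifacts noted in the QA section.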

invokeai/backend/stable_diffusion/diffusers_pipeline.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -499,6 +499,22 @@ def step(
                     for idx, value in enumerate(single_t2i_adapter_data.adapter_state):
                         accum_adapter_state[idx] += value * t2i_adapter_weight
 
+            # Hack: force compatibility with irregular resolutions by padding the feature map with zeros
+            for idx, tensor in enumerate(accum_adapter_state):
+                # The tensor size is supposed to be some integer downscale factor of the latents size.
+                # Internally, the unet will pad the latents before downscaling between levels when it is no longer divisible by its downscale factor.
+                # If the latent size does not scale down evenly, we need to pad the tensor so that it matches the downscaled padded latents later on.
+                scale_factor = latents.size()[-1] // tensor.size()[-1]
+                required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]
+                required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]
+                tensor = torch.nn.functional.pad(
+                    tensor,
+                    (0, required_padding_width, 0, required_padding_height, 0, 0, 0, 0),
+                    mode="constant",
+                    value=0,
+                )
+                accum_adapter_state[idx] = tensor
+
             down_intrablock_additional_residuals = accum_adapter_state
 
         # Handle inpainting models.
```
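For reference, a minimal, self-contained version of the padding step with concrete shapes (assumed for the example; the real tensors come from the accumulated adapter state). A 4-element pad tuple is used here because only the last two dimensions are padded, which is equivalent to the 8-element all-zero tuple used in the diff above.

```python
import math
import torch

# Assumed example shapes: a 1032x1024 px image gives a 129x128 latent; one adapter
# residual sits at half the latent resolution, where floor division gave 64x64.
latents = torch.zeros(1, 4, 129, 128)
feature = torch.zeros(1, 320, 64, 64)

scale_factor = latents.size(-1) // feature.size(-1)                        # 2
pad_width = math.ceil(latents.size(-1) / scale_factor) - feature.size(-1)  # 64 - 64 = 0
pad_height = math.ceil(latents.size(-2) / scale_factor) - feature.size(-2) # 65 - 64 = 1

# Zero-pad the right and bottom edges so the residual matches the size the UNet's
# padded downsampling will actually produce at this level.
padded = torch.nn.functional.pad(feature, (0, pad_width, 0, pad_height), mode="constant", value=0)
print(padded.shape)  # torch.Size([1, 320, 65, 64])
```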

invokeai/frontend/web/src/common/hooks/useIsReadyToEnqueue.ts

Lines changed: 0 additions & 40 deletions
```diff
@@ -202,46 +202,6 @@ const createSelector = (
         if (controlLayer.controlAdapter.model?.base !== model?.base) {
           problems.push(i18n.t('parameters.invoke.layer.controlAdapterIncompatibleBaseModel'));
         }
-        // T2I Adapters require images have dimensions that are multiples of 64 (SD1.5) or 32 (SDXL)
-        if (controlLayer.controlAdapter.type === 't2i_adapter') {
-          const multiple = model?.base === 'sdxl' ? 32 : 64;
-          if (bbox.scaleMethod === 'none') {
-            if (bbox.rect.width % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxWidth', {
-                  multiple,
-                  width: bbox.rect.width,
-                }),
-              });
-            }
-            if (bbox.rect.height % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxHeight', {
-                  multiple,
-                  height: bbox.rect.height,
-                }),
-              });
-            }
-          } else {
-            if (bbox.scaledSize.width % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxWidth', {
-                  multiple,
-                  width: bbox.scaledSize.width,
-                }),
-              });
-            }
-            if (bbox.scaledSize.height % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxHeight', {
-                  multiple,
-                  height: bbox.scaledSize.height,
-                }),
-              });
-            }
-          }
-        }
-
         if (problems.length) {
           const content = upperFirst(problems.join(', '));
           reasons.push({ prefix, content });
```