@@ -62,8 +62,6 @@ def setup(self, ctx: DenoiseContext):
             image=self._image,
             latents_height=latents_height,
             latents_width=latents_width,
-            max_unet_downscale=self._max_unet_downscale,
-            resize_mode=self._resize_mode,
         )
 
     def _run_model(
@@ -72,21 +70,28 @@ def _run_model(
         image: Image,
         latents_height: int,
         latents_width: int,
-        max_unet_downscale: int,
-        resize_mode: CONTROLNET_RESIZE_VALUES,
     ):
-        input_height = latents_height // max_unet_downscale * model.total_downscale_factor
-        input_width = latents_width // max_unet_downscale * model.total_downscale_factor
-
+        # Resize the T2I-Adapter input image.
+        # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
+        # result will match the latent image's dimensions after max_unet_downscale is applied.
+        input_height = latents_height // self._max_unet_downscale * model.total_downscale_factor
+        input_width = latents_width // self._max_unet_downscale * model.total_downscale_factor
+
+        # Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare
+        # a single image. If CFG is enabled, we will duplicate the resultant tensor after applying the
+        # T2I-Adapter model.
+        #
+        # Note: We re-use the `prepare_control_image(...)` from ControlNet for T2I-Adapter, because it has many
+        # of the same requirements (e.g. preserving binary masks during resize).
         t2i_image = prepare_control_image(
             image=image,
             do_classifier_free_guidance=False,
             width=input_width,
             height=input_height,
-            num_channels=model.config["in_channels"],  # mypy treats this as a FrozenDict
+            num_channels=model.config["in_channels"],
             device=model.device,
             dtype=model.dtype,
-            resize_mode=resize_mode,
+            resize_mode=self._resize_mode,
         )
 
         return model(t2i_image)
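The comment block added in this hunk reasons about the resize arithmetic. Here is a minimal sketch of that arithmetic with illustrative values; the specific numbers below are assumptions for the sketch, not taken from any real model config:

```python
# Illustrative values only (assumptions, not from a real config):
latents_height = 96          # e.g. a 768px-tall image with an 8x VAE scale factor
max_unet_downscale = 8       # deepest UNet downscale level the adapter feeds
total_downscale_factor = 16  # total downscaling applied inside the T2I-Adapter

# Same expression as in the diff above:
input_height = latents_height // max_unet_downscale * total_downscale_factor  # -> 192

# After the adapter downscales its input, the feature map height matches the
# latent height at the deepest UNet level it conditions:
assert input_height // total_downscale_factor == latents_height // max_unet_downscale
```

Flooring by `max_unet_downscale` before multiplying also guarantees that `input_height` is an exact multiple of `total_downscale_factor`, so no further rounding happens inside the adapter.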
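The first "Note" in the diff says the CFG duplication happens after the adapter runs. A hedged sketch of what that duplication could look like, assuming NCHW feature maps (`expand_for_cfg` is an invented helper name, not part of this PR):

```python
import torch

def expand_for_cfg(adapter_features: list[torch.Tensor], do_cfg: bool) -> list[torch.Tensor]:
    """Hypothetical post-processing step: the adapter is run on a single
    (batch=1) NCHW image, and if classifier-free guidance is enabled the
    resulting feature maps are repeated along the batch dimension so that
    both the unconditional and conditional halves of the CFG batch receive
    the same adapter conditioning."""
    if not do_cfg:
        return adapter_features
    return [f.repeat(2, 1, 1, 1) for f in adapter_features]
```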
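The second "Note" mentions preserving binary masks during resize as a requirement shared with ControlNet. This standalone snippet illustrates why the resize mode matters for binary inputs; it does not call `prepare_control_image` itself:

```python
import torch
import torch.nn.functional as F

# A random {0, 1} mask, shaped NCHW.
mask = (torch.rand(1, 1, 64, 64) > 0.5).float()

# Nearest-neighbour resizing keeps the mask strictly binary...
nearest = F.interpolate(mask, size=(48, 48), mode="nearest")
assert set(nearest.unique().tolist()) <= {0.0, 1.0}

# ...while bilinear resizing typically introduces intermediate grey values
# at region boundaries, corrupting a binary conditioning image.
bilinear = F.interpolate(mask, size=(48, 48), mode="bilinear", align_corners=False)
print(bilinear.unique().numel())  # usually far more than 2
```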