
Commit 7505f91: Add option for sequential unet predictions
Parent: 6365319

File tree: 2 files changed (+39, -10 lines)


README.md (1 addition, 0 deletions)

@@ -514,6 +514,7 @@ Please refer to the help menu for all available arguments: `python -m python_cor
 - `--scheduler`: If you would like to experiment with different schedulers, you may specify it here. For available options, please see the help menu. You may also specify a custom number of inference steps by `--num-inference-steps` which defaults to 50.
 - `--controlnet`: ControlNet models specified with this option are used in image generation. Use this option in the format `--controlnet lllyasviel/sd-controlnet-mlsd lllyasviel/sd-controlnet-depth` and make sure to use `--controlnet-inputs` in conjunction.
 - `--controlnet-inputs`: Image inputs corresponding to each ControlNet model. Please provide image paths in same order as models in `--controlnet`, for example: `--controlnet-inputs image_mlsd image_depth`.
+- `--unet-batch-one`: Do not batch unet predictions for the prompt and negative prompt. This requires that the unet has been converted with a batch size of one; see the `--unet-batch-one` option in the conversion script.
 
 </details>
 
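Usage note (illustrative, not part of this diff): after converting the unet with a batch size of one, the new flag is passed alongside the usual generation arguments, for example `python -m python_coreml_stable_diffusion.pipeline --prompt "..." --negative-prompt "..." --unet-batch-one ...`. The full module path is assumed from the repository layout above; the remaining model and output arguments are elided.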
python_coreml_stable_diffusion/pipeline.py (38 additions, 10 deletions)

@@ -416,9 +416,10 @@ def __call__(
         callback=None,
         callback_steps=1,
         controlnet_cond=None,
-        original_size: Optional[Tuple[int, int]] = None,
-        crops_coords_top_left: Tuple[int, int] = (0, 0),
-        target_size: Optional[Tuple[int, int]] = None,
+        original_size: Optional[Tuple[int, int]]=None,
+        crops_coords_top_left: Tuple[int, int]=(0, 0),
+        target_size: Optional[Tuple[int, int]]=None,
+        unet_batch_one=False,
         **kwargs,
     ):
         # 1. Check inputs. Raise error if not correct
@@ -525,16 +526,38 @@ def __call__(
             # predict the noise residual
             unet_additional_kwargs.update(control_net_additional_residuals)
 
-            noise_pred = self.unet(
-                sample=latent_model_input.astype(np.float16),
-                timestep=timestep,
-                encoder_hidden_states=text_embeddings.astype(np.float16),
-                **unet_additional_kwargs,
-            )["noise_pred"]
+            # get prediction from unet
+            if not (unet_batch_one and do_classifier_free_guidance):
+                noise_pred = self.unet(
+                    sample=latent_model_input.astype(np.float16),
+                    timestep=timestep,
+                    encoder_hidden_states=text_embeddings.astype(np.float16),
+                    **unet_additional_kwargs,
+                )["noise_pred"]
+
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+            else:
+                # query unet sequentially
+                latent_model_input = latent_model_input.astype(np.float16)
+                text_embeddings = text_embeddings.astype(np.float16)
+                timestep = np.array([t,], np.float16)
+
+                noise_pred_uncond = self.unet(
+                    sample=np.expand_dims(latent_model_input[0], axis=0),
+                    timestep=timestep,
+                    encoder_hidden_states=np.expand_dims(text_embeddings[0], axis=0),
+                    **unet_additional_kwargs,
+                )["noise_pred"]
+                noise_pred_text = self.unet(
+                    sample=np.expand_dims(latent_model_input[1], axis=0),
+                    timestep=timestep,
+                    encoder_hidden_states=np.expand_dims(text_embeddings[1], axis=0),
+                    **unet_additional_kwargs,
+                )["noise_pred"]
 
             # perform guidance
             if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
                 noise_pred = noise_pred_uncond + guidance_scale * (
                     noise_pred_text - noise_pred_uncond)
 
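For orientation, here is a minimal NumPy sketch of the idea behind this hunk (the function, its argument layout, and the `unet` callable are illustrative stand-ins, not the pipeline's actual API): with a batch-one unet, the unconditional and text-conditioned noise predictions come from two separate forward passes instead of one batched call, and classifier-free guidance combines them the same way afterwards.

    import numpy as np

    def predict_noise(unet, latents, uncond_emb, text_emb, timestep,
                      guidance_scale, unet_batch_one=False):
        # `unet` stands in for a Core ML model wrapper returning a dict with a
        # "noise_pred" array, as in the pipeline above; `latents` has batch size 1.
        if not unet_batch_one:
            # Batched path: run both conditions through the unet in one call.
            sample = np.concatenate([latents, latents]).astype(np.float16)
            embeddings = np.concatenate([uncond_emb, text_emb]).astype(np.float16)
            noise = unet(sample=sample, timestep=timestep,
                         encoder_hidden_states=embeddings)["noise_pred"]
            noise_uncond, noise_text = np.split(noise, 2)
        else:
            # Sequential path: the unet only accepts batch size 1, so each
            # condition gets its own forward pass.
            latents = latents.astype(np.float16)
            noise_uncond = unet(sample=latents, timestep=timestep,
                                encoder_hidden_states=uncond_emb.astype(np.float16))["noise_pred"]
            noise_text = unet(sample=latents, timestep=timestep,
                              encoder_hidden_states=text_emb.astype(np.float16))["noise_pred"]
        # Guidance is applied identically in both paths.
        return noise_uncond + guidance_scale * (noise_text - noise_uncond)

The trade-off is two unet evaluations per denoising step in exchange for a model that only ever sees batch size one.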
@@ -751,6 +774,7 @@ def main(args):
         guidance_scale=args.guidance_scale,
         controlnet_cond=controlnet_cond,
         negative_prompt=args.negative_prompt,
+        unet_batch_one=args.unet_batch_one,
     )
 
     out_path = get_image_path(args)
@@ -821,6 +845,10 @@ def main(args):
         "--negative-prompt",
         default=None,
         help="The negative text prompt to be used for text-to-image generation.")
+    parser.add_argument(
+        "--unet-batch-one",
+        action="store_true",
+        help="Do not batch unet predictions for the prompt and negative prompt.")
     parser.add_argument('--model-sources',
         default=None,
         choices=['packages', 'compiled'],