Merge pull request #360 from TobyRoseman/unet-batch-1-or-2

TobyRoseman · web-flow · commit 1c194dafb4c5 · 2024-10-04T13:57:20.000-07:00
Allow not using classifier free guidance
diff --git a/README.md b/README.md
@@ -490,6 +490,8 @@ This generally takes 15-20 minutes on an M1 MacBook Pro. Upon successful executi
 
 - `--unet-support-controlnet`: enables a converted UNet model to receive additional inputs from ControlNet. This is required for generating image with using ControlNet and saved with a different name, `*_control-unet.mlpackage`, distinct from normal UNet. On the other hand, this UNet model can not work without ControlNet. Please use normal UNet for just txt2img.
 
+- `--unet-batch-one`: uses a batch size of one for the UNet. This is needed if you do not want to use classifier-free guidance, i.e. when using a `guidance-scale` of less than one.
+
 - `--convert-vae-encoder`: not required for text-to-image applications. Required for image-to-image applications in order to map the input image to the latent space.
 
 </details>
diff --git a/python_coreml_stable_diffusion/pipeline.py b/python_coreml_stable_diffusion/pipeline.py
@@ -506,11 +506,16 @@ def __call__(
             if isinstance(latent_model_input, torch.Tensor):
                 latent_model_input = latent_model_input.numpy()
 
+            if do_classifier_free_guidance:
+                timestep = np.array([t, t], np.float16)
+            else:
+                timestep = np.array([t,], np.float16)
+
             # controlnet
             if controlnet_cond:
                 control_net_additional_residuals = self.run_controlnet(
                     sample=latent_model_input,
-                    timestep=np.array([t, t]),
+                    timestep=timestep,
                     encoder_hidden_states=text_embeddings,
                     controlnet_cond=controlnet_cond,
                 )
@@ -522,7 +527,7 @@ def __call__(
 
             noise_pred = self.unet(
                 sample=latent_model_input.astype(np.float16),
-                timestep=np.array([t, t], np.float16),
+                timestep=timestep,
                 encoder_hidden_states=text_embeddings.astype(np.float16),
                 **unet_additional_kwargs,
             )["noise_pred"]
diff --git a/python_coreml_stable_diffusion/torch2coreml.py b/python_coreml_stable_diffusion/torch2coreml.py
@@ -757,7 +757,7 @@ def forward(self, x):
     gc.collect()
 
 
-def convert_unet(pipe, args, model_name = None):
+def convert_unet(pipe, args, model_name=None):
     """ Converts the UNet component of Stable Diffusion
     """
     if args.unet_support_controlnet:
@@ -783,6 +783,8 @@ def convert_unet(pipe, args, model_name = None):
     elif not os.path.exists(out_path):
         # Prepare sample input shapes and values
         batch_size = 2  # for classifier-free guidance
+        if args.unet_batch_one:
+            batch_size = 1  # for not using classifier-free guidance
         sample_shape = (
             batch_size,                    # B
             pipe.unet.config.in_channels,  # C
@@ -1674,6 +1676,13 @@ def parser_spec():
         "If specified, enable unet to receive additional inputs from controlnet. "
         "Each input added to corresponding resnet output."
         )
+    parser.add_argument(
+        "--unet-batch-one",
+        action="store_true",
+        help=
+        "If specified, a batch size of one will be used for the unet, this is needed if you do not want to do "
+        "classifier free guidance. Default unet batch size is two, which is needed for classifier free guidance."
+        )
     parser.add_argument("--include-t5", action="store_true")
 
     # Swift CLI Resource Bundling