Fixes for vae precision/attn decomposition, numerics validation

monorimet · monorimet · commit fd2a2ba40e21 · 2024-06-18T16:00:32.000-05:00
diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py
@@ -247,6 +247,12 @@ def is_valid_file(arg):
     default="fp16",
     help="Precision of Stable Diffusion weights and graph.",
 )
+p.add_argument(
+    "--vae_precision",
+    type=str,
+    default=None,
+    help="Precision of Stable Diffusion VAE weights and graph.",
+)
 p.add_argument(
     "--max_length", type=int, default=77, help="Sequence Length of Stable Diffusion"
 )
@@ -257,7 +263,7 @@ def is_valid_file(arg):
 p.add_argument(
     "--vae_decomp_attn",
     type=bool,
-    default=True,
+    default=False,
     help="Decompose attention for VAE decode only at fx graph level",
 )
 p.add_argument(
diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py
@@ -46,7 +46,6 @@ def __init__(
         hf_model_name: str,
         height: int,
         width: int,
-        shift: float,
         precision: str,
         max_length: int,
         batch_size: int,
@@ -59,10 +58,12 @@ def __init__(
         pipeline_dir: str = "./shark_vmfbs",
         external_weights_dir: str = "./shark_weights",
         external_weights: str = "safetensors",
-        vae_decomp_attn: bool = True,
-        custom_vae: str = "",
+        vae_decomp_attn: bool = False,
         cpu_scheduling: bool = False,
+        vae_precision: str = "fp32",
         scheduler_id: str = None, #compatibility only, always uses EulerFlowScheduler
+        shift: float = 1.0,
+
     ):
         self.hf_model_name = hf_model_name
         # self.scheduler_id = scheduler_id
@@ -120,10 +121,11 @@ def __init__(
         self.external_weights_dir = external_weights_dir
         self.external_weights = external_weights
         self.vae_decomp_attn = vae_decomp_attn
-        self.custom_vae = custom_vae
+        self.custom_vae = None
         self.cpu_scheduling = cpu_scheduling
         self.torch_dtype = torch.float32 if self.precision == "fp32" else torch.float16
-        self.vae_dtype = torch.float32
+        self.vae_precision = vae_precision if vae_precision else self.precision
+        self.vae_dtype = torch.float32 if vae_precision == "fp32" else torch.float16
         # TODO: set this based on user-inputted guidance scale and negative prompt.
         self.do_classifier_free_guidance = True  # False if any(x in hf_model_name for x in ["turbo", "lightning"]) else True
 
@@ -206,7 +208,12 @@ def is_prepared(self, vmfbs, weights):
             )
             if w_key == "clip":
                 default_name = os.path.join(
-                    self.external_weights_dir, f"sd3_clip_fp16.irpa"
+                    self.external_weights_dir, f"sd3_text_encoders_{self.precision}.irpa"
+                )
+            if w_key == "mmdit":
+                default_name = os.path.join(
+                    self.external_weights_dir,
+                    f"sd3_mmdit_{self.precision}." + self.external_weights,
                 )
             if weights[w_key] is None and os.path.exists(default_name):
                 weights[w_key] = os.path.join(default_name)
@@ -357,7 +364,7 @@ def export_submodel(
                     self.batch_size,
                     self.height,
                     self.width,
-                    "fp32",
+                    self.vae_precision,
                     "vmfb",
                     self.external_weights,
                     vae_external_weight_path,
@@ -586,7 +593,8 @@ def generate_images(
                     dtype=self.vae_dtype,
                 )
             else:
-                latents = sample.astype("float32")
+                vae_numpy_dtype = np.float32 if self.vae_precision == "fp32" else np.float16
+                latents = sample.astype(vae_numpy_dtype)
 
             vae_start = time.time()
             vae_out = self.runners["vae"].ctx.modules.compiled_vae["decode"](latents)
@@ -634,7 +642,7 @@ def generate_images(
             out_image = Image.fromarray(image)
             images.extend([[out_image]])
         if return_imgs:
-            return images
+            return images[0]
         for idx_batch, image_batch in enumerate(images):
             for idx, image in enumerate(image_batch):
                 img_path = (
@@ -767,7 +775,6 @@ def run_diffusers_cpu(
         args.hf_model_name,
         args.height,
         args.width,
-        args.shift,
         args.precision,
         args.max_length,
         args.batch_size,
@@ -779,9 +786,8 @@ def run_diffusers_cpu(
         args.decomp_attn,
         args.pipeline_dir,
         args.external_weights_dir,
-        args.external_weights,
-        args.vae_decomp_attn,
-        custom_vae=None,
+        external_weights=args.external_weights,
+        vae_decomp_attn=args.vae_decomp_attn,
         cpu_scheduling=args.cpu_scheduling,
         vae_precision=args.vae_precision,
     )
diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae_runner.py
@@ -15,8 +15,8 @@ def run_vae(
 ):
     runner = vmfbRunner(device, vmfb_path, external_weight_path)
     inputs = [ireert.asdevicearray(runner.config.device, example_input)]
-    results = runner.ctx.modules.compiled_vae["decode"](*inputs)
-
+    results = runner.ctx.modules.compiled_vae["decode"](*inputs).to_host()
+    results = imagearray_from_vae_out(results)
     return results
 
 
@@ -32,11 +32,19 @@ def run_torch_vae(hf_model_name, variant, example_input):
     elif variant == "encode":
         results = vae_model.encode(example_input)
     np_torch_output = results.detach().cpu().numpy()
+    np_torch_output = imagearray_from_vae_out(np_torch_output)  
     return np_torch_output
 
+def imagearray_from_vae_out(image):
+    if image.ndim == 4:
+        image = image[0]
+    image = torch.from_numpy(image).cpu().permute(1, 2, 0).float().numpy()
+    image = (image * 255).round().astype("uint8")
+    return image
 
 if __name__ == "__main__":
     from turbine_models.custom_models.sd3_inference.sd3_cmd_opts import args
+    import numpy as np
 
     dtype = torch.float16 if args.precision == "fp16" else torch.float32
     if args.vae_variant == "decode":
@@ -57,9 +65,9 @@ def run_torch_vae(hf_model_name, variant, example_input):
     )
     print(
         "TURBINE OUTPUT:",
-        turbine_results.to_host(),
-        turbine_results.to_host().shape,
-        turbine_results.to_host().dtype,
+        turbine_results,
+        turbine_results.shape,
+        turbine_results.dtype,
     )
     if args.compare_vs_torch:
         print("generating torch output: ")
@@ -69,9 +77,10 @@ def run_torch_vae(hf_model_name, variant, example_input):
             args.hf_model_name, args.vae_variant, example_input.float()
         )
         print("TORCH OUTPUT:", torch_output, torch_output.shape, torch_output.dtype)
-        err = utils.largest_error(torch_output, turbine_results)
-        print("Largest Error: ", err)
-        assert err < 2e-3
+        # Allow a small amount of wiggle room for rounding errors (1)
+        np.testing.assert_allclose(
+            turbine_results, torch_output, rtol=1, atol=1
+        )
 
     # TODO: Figure out why we occasionally segfault without unlinking output variables
     turbine_results = None
diff --git a/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py b/models/turbine_models/custom_models/sd3_inference/text_encoder_impls.py
@@ -341,8 +341,10 @@ def __init__(self):
         self.clip_g = SDXLClipGTokenizer(clip_tokenizer)
         self.t5xxl = T5XXLTokenizer()
 
-    def tokenize_with_weights(self, text: str):
+    def tokenize_with_weights(self, text: str | list[str]):
         out = {}
+        if isinstance(text, list):
+            text = text[0]
         out["g"] = self.clip_g.tokenize_with_weights(text)
         out["l"] = self.clip_l.tokenize_with_weights(text)
         out["t5xxl"] = self.t5xxl.tokenize_with_weights(text)
diff --git a/models/turbine_models/custom_models/sdxl_inference/unet_runner.py b/models/turbine_models/custom_models/sdxl_inference/unet_runner.py
@@ -31,9 +31,8 @@ def run_unet(
         ireert.asdevicearray(runner.config.device, prompt_embeds),
         ireert.asdevicearray(runner.config.device, text_embeds),
         ireert.asdevicearray(runner.config.device, time_ids),
-        ireert.asdevicearray(runner.config.device, guidance_scale),
     ]
-    results = runner.ctx.modules.compiled_unet["main"](*inputs)
+    results = runner.ctx.modules.compiled_unet["run_forward"](*inputs)
 
     return results
 
@@ -57,7 +56,6 @@ def run_unet_steps(
         ireert.asdevicearray(runner.config.device, prompt_embeds),
         ireert.asdevicearray(runner.config.device, text_embeds),
         ireert.asdevicearray(runner.config.device, time_ids),
-        ireert.asdevicearray(runner.config.device, (guidance_scale,)),
     ]
     for i, t in tqdm(enumerate(scheduler.timesteps)):
         timestep = t
@@ -69,7 +67,7 @@ def run_unet_steps(
         inputs[1] = timestep = ireert.asdevicearray(
             runner.config.device, (timestep,), dtype="int64"
         )
-        noise_pred = runner.ctx.modules.compiled_unet["main"](*inputs).to_host()
+        noise_pred = runner.ctx.modules.compiled_unet["run_forward"](*inputs).to_host()
         sample = scheduler.step(
             torch.from_numpy(noise_pred).cpu(),
             timestep,
diff --git a/models/turbine_models/model_runner.py b/models/turbine_models/model_runner.py
@@ -1,7 +1,7 @@
 import argparse
 import sys
 from iree import runtime as ireert
-#from iree.runtime._binding import create_hal_driver
+from iree.runtime._binding import create_hal_driver
 
 
 class vmfbRunner:
@@ -11,14 +11,14 @@ def __init__(self, device, vmfb_path, external_weight_path=None, extra_plugin=No
         # If an extra plugin is requested, add a global flag to load the plugin
         # and create the driver using the non-caching creation function, as
         # the caching creation function may ignore the flag.
-        # if extra_plugin:
-        #     ireert.flags.parse_flags(f"--executable_plugin={extra_plugin}")
-        #     haldriver = create_hal_driver(device)
+        if extra_plugin:
+            ireert.flags.parse_flags(f"--executable_plugin={extra_plugin}")
+            haldriver = create_hal_driver(device)
 
         # No plugin requested: create the driver with the caching create
         # function.
-        #else:
-        haldriver = ireert.get_driver(device)
+        else:
+            haldriver = ireert.get_driver(device)
         if "://" in device:
             try:
                 device_idx = int(device.split("://")[-1])