Multi-device support (SDXL)

monorimet · monorimet · commit f6ab086dd19a · 2024-06-17T10:37:36.000-05:00
diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py b/models/turbine_models/custom_models/sd3_inference/sd3_cmd_opts.py
@@ -177,6 +177,14 @@ def is_valid_file(arg):
     help="Do one-shot inference from tokens to image in a shrink-wrapped pipeline binary.",
 )
 
+p.add_argument(
+    "--npu_delegate_path",
+    type=str,
+    default=None,
+    help="Path to npu executable plugin .dll for running VAE on NPU.",
+)
+
+
 p.add_argument(
     "--clip_device",
     default=None,
diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py b/models/turbine_models/custom_models/sd3_inference/sd3_pipeline.py
@@ -774,13 +774,18 @@ def run_diffusers_cpu(
         args.vae_decomp_attn,
         custom_vae=None,
         cpu_scheduling=args.cpu_scheduling,
+        vae_precision=args.vae_precision,
     )
     vmfbs, weights = sd3_pipe.check_prepared(mlirs, vmfbs, weights)
     if args.cpu_scheduling:
         vmfbs.pop("scheduler")
         weights.pop("scheduler")
+    if args.npu_delegate_path:
+        extra_device_args = {"npu_delegate_path": args.npu_delegate_path}
+    else:
+        extra_device_args = {}
     sd3_pipe.load_pipeline(
-        vmfbs, weights, args.compiled_pipeline, args.split_scheduler
+        vmfbs, weights, args.compiled_pipeline, args.split_scheduler, extra_device_args=extra_device_args
     )
     sd3_pipe.generate_images(
         args.prompt,
diff --git a/models/turbine_models/custom_models/sd3_inference/sd3_vae.py b/models/turbine_models/custom_models/sd3_inference/sd3_vae.py
@@ -90,6 +90,9 @@ def export_vae_model(
         )
         return vmfb_path
 
+    if device == "cpu":
+        decomp_attn = True
+
     if dtype == torch.float16:
         vae_model = vae_model.half()
     mapper = {}
diff --git a/models/turbine_models/custom_models/sd_inference/schedulers.py b/models/turbine_models/custom_models/sd_inference/schedulers.py
@@ -160,7 +160,6 @@ def initialize(self, sample):
         step_indexes = torch.tensor(len(self.module.timesteps))
         timesteps = self.timesteps
         sample = sample * self.module.init_noise_sigma
-        print(sample, add_time_ids, step_indexes, timesteps)
         add_time_ids = ireert.asdevicearray(self.dest, add_time_ids, self.dtype)
         return sample, add_time_ids, step_indexes, timesteps
 
@@ -184,11 +183,6 @@ def step(self, noise_pred, t, latents, guidance_scale, i):
             noise_pred = noise_pred_uncond + guidance_scale * (
                 noise_pred_text - noise_pred_uncond
             )
-        print(
-            noise_pred[:, :, 0, 2],
-            t,
-            latents[:, :, 0, 2],
-        )
         return self.module.step(
             noise_pred,
             t,
diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_cmd_opts.py
@@ -125,7 +125,7 @@ def is_valid_file(arg):
 
 p.add_argument(
     "--split_scheduler",
-    default=False,
+    default=True,
     action="store_true",
     help="Use a decoupled unet and scheduler for better QOL.",
 )
@@ -158,6 +158,62 @@ def is_valid_file(arg):
     help="Do one-shot inference from tokens to image in a shrink-wrapped pipeline binary.",
 )
 
+p.add_argument(
+    "--vae_precision",
+    type=str,
+    default="fp16",
+    help="Precision of VAE weights and graph.",
+)
+
+p.add_argument(
+    "--npu_delegate_path",
+    type=str,
+    default=None,
+    help="Path to npu executable plugin .dll for running VAE on NPU.",
+)
+
+p.add_argument(
+    "--clip_device",
+    default=None,
+    type=str,
+    help="Device to run CLIP on. If None, defaults to the device specified in args.device.",
+)
+
+p.add_argument(
+    "--unet_device",
+    default=None,
+    type=str,
+    help="Device to run unet on. If None, defaults to the device specified in args.device.",
+)
+
+p.add_argument(
+    "--vae_device",
+    default=None,
+    type=str,
+    help="Device to run VAE on. If None, defaults to the device specified in args.device.",
+)
+
+p.add_argument(
+    "--clip_target",
+    default=None,
+    type=str,
+    help="IREE target for CLIP compilation. If None, defaults to the target specified by --iree_target_triple.",
+)
+
+p.add_argument(
+    "--unet_target",
+    default=None,
+    type=str,
+    help="IREE target for unet compilation. If None, defaults to the target specified by --iree_target_triple.",
+)
+
+p.add_argument(
+    "--vae_target",
+    default=None,
+    type=str,
+    help="IREE target for vae compilation. If None, defaults to the target specified by --iree_target_triple.",
+)
+
 ##############################################################################
 # SDXL Modelling Options
 #    These options are used to control model defining parameters for SDXL.
diff --git a/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py b/models/turbine_models/custom_models/sdxl_inference/sdxl_compiled_pipeline.py
diff --git a/models/turbine_models/custom_models/sdxl_inference/vae.py b/models/turbine_models/custom_models/sdxl_inference/vae.py

@@ -90,6 +90,9 @@ def export_vae_model(
         )
         return vmfb_path
 
+    if device == "cpu":
+        decomp_attn = True
+
     if dtype == torch.float16:
         vae_model = vae_model.half()
     mapper = {}