@@ -248,11 +248,14 @@ def process_boo_command(
248248 boo_path_config : BooPathConfig ,
249249 root_logger : logging .Logger ,
250250 starter_td_spec : Path | None ,
251- boo_runtime ,
252- get_launchable ,
253- BooOpRegistry ,
254251) -> Path | None :
255252 """Process a single BOO command through compilation and tuning."""
253+ # These imports are slow due to a pytorch dependency. Keeping them local helps
254+ # make '--help' fast.
255+ from iree .turbine .kernel .boo .driver .launch import get_launchable
256+ from iree .turbine .kernel .boo import runtime as boo_runtime
257+ from iree .turbine .kernel .boo .op_exports .registry import BooOpRegistry
258+
256259 sig = BooOpRegistry .parse_command (cli_args , ignore_unhandled_args = True )
257260 if sig is None :
258261 raise ValueError (f"Boo op registry failed to parse '{ shlex .join (cli_args )} '." )
@@ -272,8 +275,14 @@ def process_boo_command(
272275
273276 # Run BOO compilation and extract source IR.
274277 with boo_runtime .use_cache_dir (boo_cache_dir ):
278+ # The "iree_boo" backend offloads to IREE in cases where we expect
279+ # performance to be better, and falls back to pytorch otherwise. We use
280+ # the experimental backend here instead, as we want to use IREE in all
281+ # cases.
275282 # Note: device="cuda" is correct for AMD GPUs.
276- get_launchable (sig )(* sig .get_sample_args (device = "cuda" , seed = 123 ))
283+ sig .get_compiled_module (backend = "iree_boo_experimental" )(
284+ * sig .get_sample_args (device = "cuda" , seed = 123 )
285+ )
277286 [op_cache_dir ] = os .listdir (boo_cache_dir )
278287 op_cache_path = boo_cache_dir / op_cache_dir
279288
@@ -347,20 +356,12 @@ def process_boo_command(
347356 return args .output_td_spec if best_spec_path else None
348357
349358
350- def load_boo () -> tuple [types .ModuleType , Callable , type ]:
351- """Load BOO runtime modules.
352-
353- These imports are slow due to a pytorch dependency. Keeping them in a
354- separate function helps make '--help' fast.
355- """
356- from iree .turbine .kernel .boo import runtime as boo_runtime
357- from iree .turbine .kernel .boo .driver .launch import get_launchable
358- from iree .turbine .kernel .boo .op_exports .registry import BooOpRegistry
359-
360- return boo_runtime , get_launchable , BooOpRegistry
361-
362-
363359def main () -> None :
360+ # Set saner defaults for pytorch/miopen environment variables. This affects
361+ # pytorch's inferred tensor layouts on AMDGPU, even when not actually using
362+ # MIOpen kernels, and are required for performance.
363+ os .environ .setdefault ("PYTORCH_MIOPEN_SUGGEST_NHWC" , "1" )
364+
364365 parsed_args : tuple [argparse .Namespace , list [str ]] = parse_args ()
365366 args , miopen_op_args = parsed_args
366367
@@ -382,7 +383,6 @@ def main() -> None:
382383 libtuner .validate_devices (args .devices )
383384 logging .info ("Validation successful!" )
384385
385- boo_runtime , get_launchable , BooOpRegistry = load_boo ()
386386 logging .getLogger ("turbine" ).setLevel (logging .WARNING )
387387
388388 mio_args = load_commands_from_file_or_args (args .commands_file , miopen_op_args )
@@ -398,9 +398,6 @@ def main() -> None:
398398 boo_path_config ,
399399 root_logger ,
400400 starter_td_spec ,
401- boo_runtime ,
402- get_launchable ,
403- BooOpRegistry ,
404401 )
405402
406403 # Update starter spec for next iteration if tuning succeeded.