Add disk caching for Water backend via object files

Hardcode84 · claude · Hardcode84 · commit 3694ae0b00c5 · 2026-03-10T12:23:34.000+01:00
After JIT compilation, dump the host object file (which embeds the GPU
binary) to the cache directory.  On cache hit, load the .o directly
into the execution engine, skipping MLIR parsing, LLVM IR translation,
and host code compilation entirely.

- Add ExecutionEngine::loadFromObjectFile (C++ + Python binding).
- Enable object cache by default so dump_to_object_file works.
- Include use_water_backend in cache hash to avoid collisions.
- Add testWaterBackendCache covering miss, store, hit, and correctness.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ivan Butygin <ivan.butygin@gmail.com>
diff --git a/tests/kernel/runtime/cache_test.py b/tests/kernel/runtime/cache_test.py
@@ -51,6 +51,7 @@
     require_cdna_3_or_4,
     require_rdna4,
     require_e2e,
+    require_water_and_ee,
 )
 
 require_cache = pytest.mark.skipif(
@@ -868,3 +869,87 @@ def simple_copy(
     assert kernel2.gpu_binary_path.endswith(
         ".hsaco"
     ), "Expected .hsaco extension for cached kernel"
+
+
+@require_e2e
+@require_cache
+@require_water_and_ee
+def testWaterBackendCache(tmp_path):
+    """Check the Water backend disk cache: miss stores a .o file, hit reloads it."""
+
+    reset_cache_manager(tmp_path)
+
+    M = tkl.sym.M
+    N = tkl.sym.N
+    ADDRESS_SPACE = tkl.sym.ADDRESS_SPACE
+
+    wave_size = 64
+    BLOCK_M = 1
+    BLOCK_N = 256
+
+    constraints: list[tkw.Constraint] = [
+        tkw.HardwareConstraint(
+            threads_per_wave=wave_size,
+            vector_shapes={M: BLOCK_M, N: BLOCK_N},
+        ),
+        tkw.WorkgroupConstraint(M, BLOCK_M, 1),
+        tkw.WorkgroupConstraint(N, BLOCK_N, 0),
+        tkw.WaveConstraint(M, BLOCK_M),
+        tkw.WaveConstraint(N, BLOCK_N),
+    ]
+
+    @tkw.wave(constraints)
+    def simple_copy(
+        a: tkl.Memory[M, N, ADDRESS_SPACE, tkl.f16],
+        b: tkl.Memory[M, N, ADDRESS_SPACE, tkl.f16],
+    ):
+        res = tkw.read(a)
+        tkw.write(res, b)
+
+    hyperparams = {
+        ADDRESS_SPACE: GLOBAL_ADDRESS_SPACE,
+        M: 16,
+        N: 256,
+    }
+
+    cache_manager = get_cache_manager()
+
+    options = WaveCompileOptions(
+        subs=copy.deepcopy(hyperparams),
+        canonicalize=True,
+        use_water_backend=True,
+    )
+    options = set_default_run_config(options)
+
+    # Nothing has been compiled yet, so the session cache must be empty.
+    assert len(cache_manager.session_cache) == 0, "Expected empty cache at start."
+
+    # First compilation: a miss that should also write the object file to disk.
+    kernel1 = wave_compile(options, simple_copy)
+
+    assert (
+        cache_manager.cache_misses == 1 and cache_manager.cache_hits == 0
+    ), "Expected first compilation to be a cache miss."
+    assert len(cache_manager.session_cache) == 1, "Expected one entry in session cache."
+
+    # The .o is expected at <hash>/<hash>.o under tmp_path (see reset_cache_manager).
+    kernel_hash = options.kernel_hash
+    obj_path = tmp_path / kernel_hash / (kernel_hash + ".o")
+    assert obj_path.exists(), f"Expected object file at {obj_path}."
+    assert obj_path.stat().st_size > 0, "Object file should not be empty."
+
+    a = device_randn(16, 256, dtype=torch.float16)
+    b = device_zeros(16, 256, dtype=torch.float16)
+    kernel1(a, b)
+    assert_close(a, b)
+
+    # Second compilation with the same options: a hit that reloads the stored .o.
+    b2 = device_zeros(16, 256, dtype=torch.float16)
+    kernel2 = wave_compile(options, simple_copy)
+
+    assert (
+        cache_manager.cache_misses == 1 and cache_manager.cache_hits == 1
+    ), "Expected second compilation to be a cache hit."
+
+    kernel2(a, b2)
+    assert_close(a, b2)
diff --git a/wave_lang/kernel/wave/cache.py b/wave_lang/kernel/wave/cache.py
@@ -214,6 +214,7 @@ def get_hash(
             options.reorder_allocs,
             options.override_schedule,
             options.use_bound_check,
+            options.use_water_backend,
         ]
 
         # Add kernel/helper function specific hashes.
diff --git a/wave_lang/kernel/wave/compile.py b/wave_lang/kernel/wave/compile.py
@@ -115,7 +115,10 @@
 from ..compiler import host_codegen, kernel_codegen, builder, dispatch_codegen
 from ..compiler.wave_codegen import WaveEmitter
 from .compile_options import WaveCompileOptions
+from pathlib import Path
+
 from .cache import (
+    get_cache_base_dir,
     get_cache_manager,
     get_temp_binary_dir,
     is_cache_enabled,
@@ -386,24 +389,23 @@ def __init__(
         self._module_handle = None
         self._host_func_ptr = None
 
-        # Serialize MLIR module to text if needed
-        # TODO: investigate why bytecode deserialization is not working
+        # Serialize MLIR module to text if needed.
+        # TODO: investigate why bytecode deserialization is not working.
         if isinstance(module, (bytes, str)):
-            # Assume it's already MLIR text
             optimized_mlir = module.decode() if isinstance(module, bytes) else module
         else:
-            # Serialize the MLIR module to text
             optimized_mlir = str(module)
 
-        # Get the execution engine instance and load the module
         from wave_lang.kernel.wave.execution_engine import get_execution_engine
 
         if not create_execution_engine:
             return
         self._engine = get_execution_engine()
         self._module_handle = self._engine.load_module_from_text(optimized_mlir)
+        self._bind_host_func()
 
-        # Look up the host wrapper function
+    def _bind_host_func(self):
+        """Look up the host wrapper function and create a ctypes callable."""
         func_name = self.options.func_name
         try:
             self._host_func_ptr = self._engine.lookup(self._module_handle, func_name)
@@ -413,32 +415,49 @@ def __init__(
                 f"Make sure the module was compiled with emit_host_func. Error: {e}"
             )
 
-        # Create ctypes function type
-        # The host wrapper signature is: void func(void* stream, PyObject* arg0, PyObject* arg1, ...)
-
+        # The host wrapper signature is:
+        # void func(void* stream, PyObject* arg0, PyObject* arg1, ...).
         num_kernel_args = len(self.options.kernel_usages)
-        arg_types = [ctypes.c_void_p] + [
-            py_object
-        ] * num_kernel_args  # +1 for stream pointer
+        arg_types = [ctypes.c_void_p] + [py_object] * num_kernel_args
         func_type = ctypes.CFUNCTYPE(None, *arg_types)
         self._cfunc = func_type(self._host_func_ptr)
 
+    def dump_to_object_file(self, path: str) -> None:
+        """Write the engine's host object file (embedding the GPU binary) to `path`."""
+        assert self._engine is not None, "no execution engine to dump from"
+        self._engine.dump_to_object_file(path)  # NOTE(review): per-kernel? verify
+
+    @classmethod
+    def from_object_file(
+        cls,
+        options: WaveCompileOptions,
+        object_file_path: str,
+        mlir_asm: str = "",
+    ) -> "WaveKernelExecutionEngine":
+        """Load a cached object file instead of compiling from MLIR."""
+        from wave_lang.kernel.wave.execution_engine import get_execution_engine
+
+        instance = cls.__new__(cls)
+        instance.options = options
+        instance.asm = mlir_asm
+        instance._engine = get_execution_engine()
+        instance._module_handle = instance._engine.load_from_object_file(
+            object_file_path
+        )
+        instance._bind_host_func()
+        return instance
+
     def __call__(self, *args):
         return self.invoke(*args)
 
     def invoke(self, *args) -> None:
-        """
-        Invokes the wave kernel with the given arguments using the ExecutionEngine.
-        """
+        """Invoke the wave kernel with the given arguments using the ExecutionEngine."""
         assert (
             self._engine is not None
-        ), "Cannot invoke kernel without creating an execution engine. Revise the constructor call."
+        ), "Cannot invoke kernel without creating an execution engine."
 
-        # Get the current stream
         stream_ptr = torch.cuda.current_stream().cuda_stream
-
-        # Call the JIT-compiled host wrapper function
-        # Signature: void func(void* stream, PyObject* arg0, PyObject* arg1, ...)
+        # Signature: void func(void* stream, PyObject* arg0, PyObject* arg1, ...).
         self._cfunc(stream_ptr, *(py_object(arg) for arg in args))
 
 
@@ -1013,6 +1032,11 @@ def get_binary_path():
         else:
             return glob.glob(str(get_temp_binary_dir() / "*.hsaco"))[0]
 
+    def _get_water_object_cache_path(kernel_hash: str) -> Path:
+        """Return the path for a cached Water object file."""
+        base = cache_manager.base_dir if cache_manager else get_cache_base_dir()
+        return base / kernel_hash / (kernel_hash + ".o")
+
     # Create an indexing context and populate substitutions.
     with IndexingContext() as idxc:
         idxc.set_subs(options.subs)
@@ -1037,22 +1061,32 @@ def get_binary_path():
             if cached_kernel:
                 options.kernel_usages = cached_kernel.kernel_sig
                 options.kernel_launch_info = cached_kernel.kernel_launch_info
-                if options.wave_runtime:
-                    binary_path = get_binary_path()
 
                 if options.print_mlir:
                     print(cached_kernel.asm)
 
-                return cls(
-                    options,
-                    cached_kernel.vmfb,
-                    cached_kernel.asm,
-                    binary_path,
-                    bound_scalar_symbols,
-                    symbols_args_map,
-                    None,
-                    None,
-                )
+                if options.use_water_backend:
+                    obj_path = _get_water_object_cache_path(options.kernel_hash)
+                    if obj_path.exists():
+                        return WaveKernelExecutionEngine.from_object_file(
+                            options, str(obj_path), cached_kernel.asm
+                        )
+                    # Object file missing from cache, fall through
+                    # to recompilation.
+                else:
+                    if options.wave_runtime:
+                        binary_path = get_binary_path()
+
+                    return cls(
+                        options,
+                        cached_kernel.vmfb,
+                        cached_kernel.asm,
+                        binary_path,
+                        bound_scalar_symbols,
+                        symbols_args_map,
+                        None,
+                        None,
+                    )
 
         # For the wave runtime, we need the hsaco binary. So we turn on
         # dumping of binaries and store in wave runtime directory. If we
@@ -1176,12 +1210,25 @@ def get_binary_path():
                 _compile_asm_to_binary(asm, options)
         elif options.use_water_backend:
             module = water_lowering_pipeline(mb.module_op, options)
-            return WaveKernelExecutionEngine(
+            engine = WaveKernelExecutionEngine(
                 options,
                 module,
                 asm,
                 create_execution_engine=not options.compile_to_mlir,
             )
+            # Cache the compiled object file for future runs.
+            if (
+                is_cache_enabled()
+                and cache_manager is not None
+                and options.kernel_hash
+                and not debug_arg_info
+                and not options.compile_to_mlir
+            ):
+                obj_path = _get_water_object_cache_path(options.kernel_hash)
+                obj_path.parent.mkdir(parents=True, exist_ok=True)
+                engine.dump_to_object_file(str(obj_path))
+                cache_manager.store_kernel(None, asm, options)
+            return engine
         elif not options.compile_to_mlir:
             # LLVM flow: only compile to VMFB when not in MLIR-only mode
             compiled_wave_vmfb = compile_to_vmfb(asm, options)
diff --git a/wave_lang/kernel/wave/execution_engine/bindings.cpp b/wave_lang/kernel/wave/execution_engine/bindings.cpp
@@ -191,6 +191,24 @@ NB_MODULE(wave_execution_engine, m) {
 
 Raises:
     RuntimeError: If function lookup fails)")
+      .def(
+          "load_from_object_file",
+          [](wave::ExecutionEngine &self, const std::string &filename) {
+            return reinterpret_cast<uintptr_t>(
+                unwrapExpected(self.loadFromObjectFile(filename),
+                               "Failed to load object file"));
+          },
+          nb::arg("filename"),
+          R"(Load a pre-compiled object file into the execution engine.
+
+Args:
+    filename: Path to the object file
+
+Returns:
+    Module handle as integer
+
+Raises:
+    RuntimeError: If loading fails)")
       .def(
           "dump_to_object_file",
           [](wave::ExecutionEngine &self, const std::string &filename) {
diff --git a/wave_lang/kernel/wave/execution_engine/execution_engine.cpp b/wave_lang/kernel/wave/execution_engine/execution_engine.cpp
@@ -368,6 +368,49 @@ wave::ExecutionEngine::lookup(wave::ExecutionEngine::ModuleHandle handle,
   return makeStringError("looked up function is null");
 }
 
+llvm::Expected<wave::ExecutionEngine::ModuleHandle>
+wave::ExecutionEngine::loadFromObjectFile(llvm::StringRef filename) {
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> fileOrErr =
+      llvm::MemoryBuffer::getFile(filename);
+  if (!fileOrErr)
+    return makeStringError("could not open object file '" + filename +
+                           "': " + fileOrErr.getError().message());
+
+  // Create a unique JITDylib for this object.
+  llvm::orc::JITDylib *dylib = nullptr;
+  while (true) {
+    std::string uniqueName =
+        (llvm::Twine("module") + llvm::Twine(uniqueNameCounter++)).str();
+    if (jit->getJITDylibByName(uniqueName))
+      continue;
+    llvm::Expected<llvm::orc::JITDylib &> res =
+        jit->createJITDylib(std::move(uniqueName));
+    if (!res)
+      return res.takeError();
+    dylib = &res.get();
+    break;
+  }
+  assert(dylib && "failed to create JITDylib");
+  llvm::orc::ExecutionSession &es = dylib->getExecutionSession();
+
+  const llvm::DataLayout &dataLayout = jit->getDataLayout();
+  dylib->addGenerator(
+      cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
+          dataLayout.getGlobalPrefix())));
+
+  if (symbolMap)
+    cantFail(dylib->define(absoluteSymbols(
+        symbolMap(llvm::orc::MangleAndInterner(es, dataLayout)))));
+
+  // A cached object may be corrupt or stale, so report load failures to the
+  // caller instead of aborting; the unusable dylib is removed along the way.
+  if (llvm::Error err = jit->addObjectFile(*dylib, std::move(*fileOrErr)))
+    return llvm::joinErrors(std::move(err), es.removeJITDylib(*dylib));
+  if (llvm::Error err = jit->initialize(*dylib))
+    return llvm::joinErrors(std::move(err), es.removeJITDylib(*dylib));
+  return static_cast<ModuleHandle>(dylib);
+}
+
 llvm::Error wave::ExecutionEngine::dumpToObjectFile(llvm::StringRef filename) {
   if (cache == nullptr)
     return makeStringError("cannot dump ExecutionEngine object code to file: "
diff --git a/wave_lang/kernel/wave/execution_engine/execution_engine.h b/wave_lang/kernel/wave/execution_engine/execution_engine.h
@@ -96,6 +96,9 @@ class ExecutionEngine {
   llvm::Expected<void *> lookup(ModuleHandle handle,
                                 llvm::StringRef name) const;
 
+  /// Load a pre-compiled object file into a new JITDylib; returns its handle.
+  llvm::Expected<ModuleHandle> loadFromObjectFile(llvm::StringRef filename);
+
   /// Dump object code to output file `filename`.
   llvm::Error dumpToObjectFile(llvm::StringRef filename);
 
diff --git a/wave_lang/kernel/wave/execution_engine/execution_engine.py b/wave_lang/kernel/wave/execution_engine/execution_engine.py
@@ -158,7 +158,7 @@ def _create_options_from_env() -> "ExecutionEngineOptions":
     def _env_enabled(var: str, default: str = "0") -> bool:
         return bool(int(os.environ.get(var, default)))
 
-    options.enable_object_cache = _env_enabled("WAVE_ENABLE_OBJECT_CACHE")
+    options.enable_object_cache = _env_enabled("WAVE_ENABLE_OBJECT_CACHE", "1")
     options.enable_gdb_notification_listener = _env_enabled("WAVE_ENABLE_GDB_LISTENER")
     options.enable_perf_notification_listener = _env_enabled(
         "WAVE_ENABLE_PERF_LISTENER"

Original file line number	Diff line number	Diff line change
`@@ -214,6 +214,7 @@ def get_hash(`
`214`	`214`	`options.reorder_allocs,`
`215`	`215`	`options.override_schedule,`
`216`	`216`	`options.use_bound_check,`
	`217`	`+ options.use_water_backend,`
`217`	`218`	`]`
`218`	`219`
`219`	`220`	`# Add kernel/helper function specific hashes.`