@@ -10,8 +10,6 @@
 from typing import Callable, Optional, Any
 import ctypes
 from ..compile_options import WaveCompileOptions
-from .compile_utils import compile_to_vmfb
-from .classes import KernelLaunchInfo
 from ..profiling import benchmark_module


@@ -24,12 +22,6 @@ def compute_grid(kernel_dynamic_dims: tuple[int], grid_fn: Callable):
     return [int(x) for x in grid_fn(list(kernel_dynamic_dims))]


-def _read_file(name, mode):
-    with open(name, mode) as file:
-        data = file.read()
-    return data
-
-
 def _write_file(name, mode, data):
     with open(name, mode) as file:
         file.write(data)
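
Note: the callback contract implied by `compute_grid` above is that `grid_fn` receives the dynamic dims as a list and returns the launch grid, which is then coerced to ints. A minimal sketch with a hypothetical 64x64 tiling (the lambda is illustrative, not from the repo):

```python
# Hypothetical grid callback: tile an (M, N) problem into 64x64 workgroups.
grid_fn = lambda dims: [dims[0] // 64, dims[1] // 64, 1]

# compute_grid coerces each entry to int, so this yields [2, 4, 1].
assert compute_grid((128, 256), grid_fn) == [2, 4, 1]
```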
@@ -50,37 +42,6 @@ def get_device_uuid(device_list: list[str], device_str: str) -> tuple[int, str]:
     return device_str


-def _invoke(vm_context, device, entry_function, inputs, outputs, dynamic_dims):
-    arg_list = rt.VmVariantList(len(inputs) + len(dynamic_dims))
-    ret_list = rt.VmVariantList(len(outputs))
-
-    for input in inputs:
-        if isinstance(input, torch.Tensor):
-            input_cpu = input.cpu().contiguous()
-            device_array = rt.asdevicearray(device, input_cpu)
-            arg_list.push_ref(device_array._buffer_view)
-        else:
-            raise ValueError(f"Unsupported input type: {type(input)}")
-
-    for dynamic_dim in dynamic_dims:
-        if isinstance(dynamic_dim, int):
-            arg_list.push_int(dynamic_dim)
-        else:
-            raise ValueError(f"Unsupported dynamic dim type: {type(dynamic_dim)}")
-
-    vm_context.invoke(entry_function, arg_list, ret_list)
-
-    for i, ret in enumerate(outputs):
-        device_buffer_view = rt.HalBufferView.__iree_vm_cast__(ret_list.get_as_ref(i))
-        device_array = rt.DeviceArray(device, device_buffer_view)
-
-        # TODO: Make to_host accept out array/buffer, so we can avoid extra data copy.
-        host_array = device_array.to_host()
-
-        # Convert to torch tensor without actually importing torch.
-        ret[:] = type(ret)(host_array)
-
-
 _dl_tensor_name = ctypes.create_string_buffer(b"dltensor")
 _set_capsule_name = ctypes.pythonapi.PyCapsule_SetName

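Note on the `_dl_tensor_name` / `_set_capsule_name` context lines kept above: the removed `_invoke` staged every tensor through host memory (`input.cpu()`, `to_host()`), while the surviving in-place path passes tensors by DLPack capsule. Per the DLPack convention, a capsule is named `"dltensor"` when produced and renamed `"used_dltensor"` once a consumer imports it, so resetting the name through the C API is what lets a capsule be handed out again. A minimal sketch of that convention; the renaming call is an assumption about how `_inplace_invoke` uses these module-level handles:

```python
import ctypes
import torch
from torch.utils.dlpack import to_dlpack

# PyCapsule_SetName stores the pointer rather than copying the string,
# so the name buffer must outlive the capsule; hence module scope.
_dl_tensor_name = ctypes.create_string_buffer(b"dltensor")
_set_capsule_name = ctypes.pythonapi.PyCapsule_SetName
_set_capsule_name.argtypes = [ctypes.py_object, ctypes.c_char_p]
_set_capsule_name.restype = ctypes.c_int

capsule = to_dlpack(torch.ones(4))
# After a consumer imports the capsule it is renamed "used_dltensor";
# resetting the name marks it as consumable again.
_set_capsule_name(capsule, _dl_tensor_name)
```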
@@ -173,14 +134,13 @@ def invoke_vmfb(
             options.benchmark_repetitions
         )

-    if options.inplace:
-        # Select device as the GPU, where input tensors are coming from.
-        device_list = tuple(
-            input.device
-            for input in kernel_inputs + kernel_outputs
-            if isinstance(input, torch.Tensor)
-        )
-        device = get_device_uuid(device_list, device)
+    # Select the GPU device that the input tensors live on.
+    device_list = tuple(
+        input.device
+        for input in kernel_inputs + kernel_outputs
+        if isinstance(input, torch.Tensor)
+    )
+    device = get_device_uuid(device_list, device)

     rt_config = rt.Config(device)
     device = rt_config.device
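
The hunk above makes UUID-based device selection unconditional: the IREE device is always derived from wherever the torch tensors live, rather than only when `options.inplace` was set. A rough sketch of the idea behind `get_device_uuid` (only `return device_str` is visible in this diff; the helper name, the URI format, and the torch `uuid` property are assumptions):

```python
import torch

def select_device_uri(device_list: tuple[torch.device, ...], device_str: str) -> str:
    """Map the torch device shared by all tensors to a runtime device URI."""
    gpu_devices = {d for d in device_list if d.type == "cuda"}
    if not gpu_devices:
        return device_str  # no GPU tensors; keep the configured device string
    if len(gpu_devices) > 1:
        raise ValueError(f"Tensors live on multiple devices: {gpu_devices}")
    index = gpu_devices.pop().index or 0
    # Assumption: torch exposes the GPU UUID on device properties and the
    # runtime accepts a UUID-addressed URI such as "hip://GPU-<uuid>".
    uuid = torch.cuda.get_device_properties(index).uuid
    return f"hip://GPU-{uuid}"
```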
@@ -202,24 +162,14 @@ def invoke_vmfb(
     if options.kernel_hash:
         RUNTIME_CACHE[options.kernel_hash] = (ctx, func)

-    if options.inplace:
-        _inplace_invoke(
-            ctx.vm_context,
-            device,
-            func,
-            kernel_inputs,
-            kernel_outputs,
-            options.dynamic_symbols_map.values(),
-        )
-    else:
-        _invoke(
-            ctx.vm_context,
-            device,
-            func,
-            kernel_inputs,
-            kernel_outputs,
-            options.dynamic_symbols_map.values(),
-        )
+    _inplace_invoke(
+        ctx.vm_context,
+        device,
+        func,
+        kernel_inputs,
+        kernel_outputs,
+        options.dynamic_symbols_map.values(),
+    )

     if options.run_bench:
         benchmark_results = benchmark_module(
@@ -278,21 +228,6 @@ def invoke_with_wave_runtime(
     wave_runtime.launch(kernel_launch_info, kernel_args, dyn_dims, scalar_args)


-def compile_and_invoke(
-    asm: str,
-    kernel_inputs: list[torch.Tensor],
-    kernel_outputs: list[torch.Tensor],
-    options: WaveCompileOptions,
-):
-    compiled_wave_vmfb = compile_to_vmfb(asm, options)
-    invoke_vmfb(
-        compiled_wave_vmfb,
-        options,
-        kernel_inputs,
-        kernel_outputs,
-    )
-
-
 def get_default_arch() -> str:
     """Return default ROCM architecture"""
     if not torch.cuda.is_available():
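
With `compile_and_invoke` removed, callers compose the two steps themselves. A sketch based directly on the deleted helper's body (`asm`, `options`, `kernel_inputs`, and `kernel_outputs` are supplied by the caller; whether call sites were migrated exactly this way is not shown in this diff):

```python
from .compile_utils import compile_to_vmfb  # import dropped from this module above

vmfb = compile_to_vmfb(asm, options)
invoke_vmfb(vmfb, options, kernel_inputs, kernel_outputs)
```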