Commit 51bf969

[Wave] Enable async kernel execution with iree runtime and fully switch to Launchable (#11)
Drop our `invoke_vmfb` code entirely, use `turbine.runtime.Launchable`, and update dispatch codegen to support async launch.

Before:

```mlir
func.func @isolated_benchmark(%arg0: tensor<8x?x128x6xf32>, %arg1: tensor<8x?x6xf32>, %arg2: tensor<?xi32>, %arg3: tensor<?x6x128xf16>, %arg4: index) -> tensor<?x6x128xf16> {
  %0 = flow.dispatch @phase_1::@phase_1[%arg4](%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<8x?x128x6xf32>{%arg4}, tensor<8x?x6xf32>{%arg4}, tensor<?xi32>{%arg4}, tensor<?x6x128xf16>{%arg4}, index) -> %arg3{%arg4}
  return %0 : tensor<?x6x128xf16>
}
```

After:

```mlir
func.func @isolated_benchmark$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: index, %arg5: !hal.fence, %arg6: !hal.fence) -> !hal.buffer_view {
  %0 = hal.tensor.import wait(%arg5) => %arg0 : !hal.buffer_view -> tensor<8x?x128x6xf32>{%arg4}
  %1 = hal.tensor.import wait(%arg5) => %arg1 : !hal.buffer_view -> tensor<8x?x6xf32>{%arg4}
  %2 = hal.tensor.import wait(%arg5) => %arg2 : !hal.buffer_view -> tensor<?xi32>{%arg4}
  %3 = hal.tensor.import wait(%arg5) => %arg3 : !hal.buffer_view -> tensor<?x6x128xf16>{%arg4}
  %4 = flow.dispatch @phase_1::@phase_1[%arg4](%0, %1, %2, %3, %arg4) : (tensor<8x?x128x6xf32>{%arg4}, tensor<8x?x6xf32>{%arg4}, tensor<?xi32>{%arg4}, tensor<?x6x128xf16>{%arg4}, index) -> %3{%arg4}
  %5 = hal.tensor.barrier join(%4 : tensor<?x6x128xf16>) => %arg6 : !hal.fence
  %6 = hal.tensor.export %5 : tensor<?x6x128xf16>{%arg4} -> !hal.buffer_view
  return %6 : !hal.buffer_view
}
```

Also, add some Python profiling code to the `WaveKernel` launch function. Launch overhead for the IREE runtime is still high (around 5x that of `wave_runtime`), but the ability to run kernels asynchronously is an overall improvement.

---------

Signed-off-by: Ivan Butygin <[email protected]>
1 parent 7050dc6 commit 51bf969
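For context, here is a rough sketch of what launching a compiled kernel through `Launchable` looks like. This is an editorial illustration, not code from this commit: `Launchable.jit_compile` and the call convention are taken from iree-turbine's public API, and the shapes are made up.

```python
# Hypothetical usage sketch of turbine.runtime.Launchable (assumed API).
import torch
from iree.turbine.runtime import Launchable

MLIR_ASM = "..."  # e.g. a module exposing isolated_benchmark$async

# jit_compile compiles lazily and caches the resulting vmfb per device.
launch = Launchable.jit_compile(MLIR_ASM)

a = torch.randn(8, 16, 128, 6, device="cuda")
b = torch.randn(8, 16, 6, device="cuda")

# The call enqueues work on the tensors' device without blocking until
# the kernel finishes, which is what makes async execution possible.
result = launch(a, b)
```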

File tree

10 files changed, +207 -172 lines


lit_tests/kernel/wave/codegen.py

Lines changed: 12 additions & 2 deletions
```diff
@@ -2167,6 +2167,8 @@ def scalar_codegen_f32(
     scalar_codegen_f32 = wave_compile(options, scalar_codegen_f32)
     print(scalar_codegen_f32.asm)
 
+    # CHECK-LABEL: test_scalar_codegen_f32
+
     # Passed scalars' dtype
     # CHECK: func.func @scalar_codegen_f32(
     # CHECK-SAME: %arg2: f32, %arg3: f32)
@@ -2177,8 +2179,11 @@ def scalar_codegen_f32(
     # CHECK: arith.addf
 
     # Final dispatch args dtype
+    # CHECK: func.func @isolated_benchmark$async(%[[ARG0:.*]]: !hal.buffer_view, %[[ARG1:.*]]: !hal.buffer_view, %[[ARG2:.*]]: f32, %[[ARG3:.*]]: f32
+    # CHECK: %[[V0:.*]] = hal.tensor.import wait(%{{.*}}) => %[[ARG0]]
+    # CHECK: %[[V1:.*]] = hal.tensor.import wait(%{{.*}}) => %[[ARG1]]
     # CHECK: flow.dispatch @scalar_codegen_f32::@scalar_codegen_f32(
-    # CHECK-SAME: %arg0, %arg1, %arg2, %arg3)
+    # CHECK-SAME: %[[V0]], %[[V1]], %[[ARG2]], %[[ARG3]])
 
 
 @run_test
@@ -2220,6 +2225,8 @@ def scalar_codegen_i32(
     scalar_codegen_i32 = wave_compile(options, scalar_codegen_i32)
     print(scalar_codegen_i32.asm)
 
+    # CHECK-LABEL: test_scalar_codegen_i32
+
     # Passed scalars' dtype: i32
     # CHECK: func.func @scalar_codegen_i32(
     # CHECK-SAME: %arg2: i32, %arg3: i32)
@@ -2230,8 +2237,11 @@ def scalar_codegen_i32(
     # CHECK: arith.addi
 
     # Final dispatch args dtype
+    # CHECK: func.func @isolated_benchmark$async(%[[ARG0:.*]]: !hal.buffer_view, %[[ARG1:.*]]: !hal.buffer_view, %[[ARG2:.*]]: i32, %[[ARG3:.*]]: i32
+    # CHECK: %[[V0:.*]] = hal.tensor.import wait(%{{.*}}) => %[[ARG0]]
+    # CHECK: %[[V1:.*]] = hal.tensor.import wait(%{{.*}}) => %[[ARG1]]
     # CHECK: flow.dispatch @scalar_codegen_i32::@scalar_codegen_i32(
-    # CHECK-SAME: %arg0, %arg1, %arg2, %arg3)
+    # CHECK-SAME: %[[V0]], %[[V1]], %[[ARG2]], %[[ARG3]])
 
 
 # This kernel copies of data from a into b if tid.x < threshold.
```
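The checks above move from hard-coded `%arg0`/`%arg1` operands to FileCheck captures because the dispatch now consumes the results of `hal.tensor.import` rather than raw function arguments, and those SSA names are not stable. As a quick reminder of the capture syntax (standard LLVM FileCheck behavior, not specific to this commit):

```python
# In these lit tests the CHECK directives are plain Python comments.
# `%[[V0:.*]]` binds whatever SSA name appears there to the variable V0;
# a later `%[[V0]]` then requires that same name, so the test tracks
# values through the IR instead of assuming fixed argument numbering.
# (The @kernel name below is a generic placeholder.)
#
# CHECK: %[[V0:.*]] = hal.tensor.import wait(%{{.*}}) => %[[ARG0]]
# CHECK: flow.dispatch @kernel::@kernel(
# CHECK-SAME: %[[V0]],
```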

lit_tests/kernel/wave/location.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -71,7 +71,7 @@ def add_loc_local_scope(
     # CHECK: vector.load {{.*}} loc("{{.*}}location.py":{{[0-9]+}}
     # CHECK: arith.addf {{.*}} loc("{{.*}}location.py":{{[0-9]+}}
     #
-    # CHECK: @isolated_benchmark(%{{.*}} loc("a"("{{.*}}location.py":{{[0-9]+}}{{.*}} loc("b"("{{.*}}location.py":{{[0-9]+}}
+    # CHECK: @isolated_benchmark$async(%{{.*}} loc("a"("{{.*}}location.py":{{[0-9]+}}{{.*}} loc("b"("{{.*}}location.py":{{[0-9]+}}
 
 
 @run_test
@@ -98,7 +98,7 @@ def add_loc_global_scope(
     # CHECK-LABEL: @add_loc_global_scope
     # CHECK: vector.load {{.*}} loc(#[[loc_load:.+]])
     # CHECK: arith.addf {{.*}} loc(#[[loc_addf:.+]])
-    # CHECK: @isolated_benchmark(%{{.*}} loc("a"(#[[loc_arg]])), %{{.*}} loc("b"(#[[loc_arg]])))
+    # CHECK: @isolated_benchmark$async(%{{.*}} loc("a"(#[[loc_arg]])), %{{.*}} loc("b"(#[[loc_arg]])), %{{.*}} loc(unknown), %{{.*}} loc(unknown))
     # CHECK-DAG: #[[loc_load]] = loc("{{.*}}location.py":{{[0-9]+}}
     # CHECK-DAG: #[[loc_addf]] = loc("{{.*}}location.py":{{[0-9]+}}
 
```
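The two trailing `%{{.*}} loc(unknown)` matches correspond to the wait/signal `!hal.fence` arguments that the async ABI appends; they have no Python-level source, so host codegen assigns them unknown locations (see the `host_codegen.py` hunk below). A minimal sketch of that location bookkeeping with the MLIR Python bindings, as an illustration rather than the commit's exact code:

```python
from iree.compiler import ir

with ir.Context():
    # Buffer arguments carry named locations such as loc("a"(...)), while
    # the two appended fence arguments get Location.unknown(), which
    # prints as `loc(unknown)`, exactly what the new CHECK line matches.
    arg_locs = [ir.Location.name("a"), ir.Location.name("b")]
    arg_locs += [ir.Location.unknown()] * 2
    print(arg_locs[-1])  # loc(unknown)
```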

lit_tests/kernel/wave/sharktank_integration.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -165,6 +165,7 @@ def generate(self, ksel: KernelSelection, kb: KernelBuilder):
             func_name=wave_kernel_name,
             compile_to_mlir=True,
             canonicalize=False,
+            iree_launch_async=False,
         )
         options = set_default_run_config(options)
         with Context() as ctx:
```
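The integration pins `iree_launch_async=False`, presumably because this path only emits MLIR for embedding into sharktank's kernel builder and never launches through the IREE runtime wrapper. For a regular `wave_compile` run, the flag would instead be enabled on the options. A hedged sketch follows; the import paths and the `subs` field are assumptions based on the surrounding code, and only `iree_launch_async` itself comes from this commit:

```python
# Illustrative sketch; module paths are assumed from this repo's layout.
from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
from wave_lang.kernel.wave.utils.run_utils import set_default_run_config

options = WaveCompileOptions(
    subs=hyperparams,        # kernel hyperparameters (assumed field)
    iree_launch_async=True,  # emit isolated_benchmark$async + fence args
)
options = set_default_run_config(options)
compiled = wave_compile(options, my_kernel)  # my_kernel: a wave kernel
```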

wave_lang/kernel/compiler/host_codegen.py

Lines changed: 67 additions & 6 deletions
```diff
@@ -19,6 +19,8 @@
     arith_d,
     flow_d,
     func_d,
+    hal_d,
+    tensor_d,
 )
 
 from .._support.indexing import IndexSymbol
@@ -31,7 +33,10 @@
 from .kernel_codegen import BindingDesc, KernelSignature
 
 
-def memref_to_tensor(memrefs: list[IrType]):
+def memref_to_tensor(memrefs: list[IrType], use_views: bool = False):
+    if use_views:
+        view_type = IrType.parse("!hal.buffer_view")
+
     tensors = []
     for m in memrefs:
         # append scalars as-it-is to tensors list
@@ -41,7 +46,7 @@ def memref_to_tensor(memrefs: list[IrType]):
             tensors.append(m)
             continue
         assert isinstance(m, MemRefType)
-        t = RankedTensorType.get(m.shape, m.element_type)
+        t = view_type if use_views else RankedTensorType.get(m.shape, m.element_type)
         tensors.append(t)
     return tensors
 
@@ -79,12 +84,14 @@ def isolated_test_call(
     dynamic_symbols: list[IndexSymbol] = [],
     *,
     location_capture_config: Optional[LocationCaptureConfig] = None,
+    async_dispatch: bool = False,
 ):
     with InsertionPoint(mb.body_block), Location.unknown():
         input_types = [b.as_mlir_type() for b in sig.kernel_buffer_bindings] + [
             b.as_mlir_type() for b in sig.scalar_bindings
         ]
-        input_tensors = memref_to_tensor(input_types)
+
+        input_tensors = memref_to_tensor(input_types, use_views=async_dispatch)
         argument_dims = get_dynamic_dims(sig.kernel_buffer_bindings, dynamic_symbols)
         # Adding unique dynamic dims as inputs.
         input_tensors += [IndexType.get() for _ in list(dict.fromkeys(argument_dims))]
@@ -93,8 +100,13 @@
             IndexType.get() for _ in set(dynamic_symbols).difference(argument_dims)
         ]
 
+        if async_dispatch:
+            fence_type = IrType.parse("!hal.fence")
+            input_tensors += [fence_type] * 2
+            func_name = func_name + "$async"
+
         output_types = [b.as_mlir_type() for b in sig.kernel_buffer_output_bindings]
-        output_tensors = memref_to_tensor(output_types)
+        output_tensors = memref_to_tensor(output_types, use_views=async_dispatch)
         result_dims = get_dynamic_dims(
             sig.kernel_buffer_output_bindings, dynamic_symbols
         )
@@ -110,13 +122,39 @@
             + scalar_bindings
             + sig.dynamic_dim_bindings
         ]
+        if async_dispatch:
+            arg_locs += [Location.unknown()] * 2
+
         entry_block = func_op.add_entry_block(arg_locs)
         scalars_offset = len(sig.kernel_buffer_bindings)
         scalars_count = len(scalar_bindings)
         dynamic_offset = scalars_offset + scalars_count
 
         with InsertionPoint(entry_block):
             arguments = entry_block.arguments
+            if async_dispatch:
+                in_fence = arguments[-2]
+                out_fence = arguments[-1]
+                arguments = list(arguments[:-2])
+
+                for i, b in enumerate(sig.kernel_buffer_bindings):
+                    shape = b.kernel_buffer_type.symbolic_shape
+
+                    arg = arguments[i]
+                    arg_type = memref_to_tensor([b.as_mlir_type()])[0]
+                    target_dims = [
+                        hal_d.buffer_view_dim(arg, d)
+                        for d in range(len(shape))
+                        if arg_type.is_dynamic_dim(d)
+                    ]
+                    arguments[i] = hal_d.tensor_import(
+                        arg_type,
+                        arg,
+                        wait_fence=in_fence,
+                        target_encoding=arg_type,
+                        target_dims=target_dims,
+                    )
+
             scalars_args = [
                 to_index(v)
                 for v, b in zip(
@@ -142,13 +180,36 @@
             )
 
             out = flow_d.DispatchOp(
-                output_tensors,
+                memref_to_tensor(output_types),  # output_tensors,
                 [dynamic_argument_map[dim] for dim in dynamic_symbols] + scalars_args,
                 entrypoints,
-                entry_block.arguments,
+                arguments,
                 [dynamic_argument_map[dim] for dim in argument_dims],
                 [dynamic_argument_map[dim] for dim in result_dims],
                 tied_operands=tied_operands,
             )
 
+            if async_dispatch:
+                out = list(out.results)
+                out_types = memref_to_tensor(
+                    [b.as_mlir_type() for b in sig.kernel_buffer_output_bindings]
+                )
+                barrier = hal_d.tensor_barrier(out_types, out, signal_fence=out_fence)
+                if len(out_types) == 1:
+                    barrier = [barrier]
+
+                view_type = IrType.parse("!hal.buffer_view")
+                for i, b in enumerate(sig.kernel_buffer_output_bindings):
+                    shape = b.kernel_buffer_type.symbolic_shape
+
+                    out_type = out_types[i]
+                    source_dims = [
+                        tensor_d.dim(out[i], arith_d.constant(IndexType.get(), d))
+                        for d in range(len(shape))
+                        if out_type.is_dynamic_dim(d)
+                    ]
+                    out[i] = hal_d.tensor_export(
+                        view_type, barrier[i], out_type, source_dims=source_dims
+                    )
+
             func_d.ReturnOp(out)
```
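Taken together, the generated wrapper imports each buffer view once the wait fence is signalled, runs the dispatch, joins the results on a barrier that signals the output fence, and exports them back to buffer views. A rough host-side sketch of driving such an entry point with fences from IREE's Python runtime; the actual launch in this commit goes through `Launchable`, and argument marshalling is elided:

```python
import iree.runtime as rt

device = rt.get_device("local-task")
sem = device.create_semaphore(0)

# Inputs are ready at timepoint 0; the device signals timepoint 1 once
# all results of the dispatch are complete.
wait_fence = rt.HalFence.create_at(sem, 0)
signal_fence = rt.HalFence.create_at(sem, 1)

# isolated_benchmark$async(buffers..., dims..., wait_fence, signal_fence)
# returns buffer views immediately; their contents are only valid once
# signal_fence is reached, so more work can be queued in the meantime.
# ... invoke the $async entry point here ...
signal_fence.wait()
```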

wave_lang/kernel/wave/cache.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -199,6 +199,7 @@ def get_hash(
         options.optimization_level,
         options.denorm_fp_math_f32,
         options.waves_per_eu,
+        options.iree_launch_async,
         options.use_buffer_load_ops,
         options.use_buffer_store_ops,
         options.use_stride_cache_swizzle,
```
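Since the launch mode changes the generated host function (sync `isolated_benchmark` vs. `isolated_benchmark$async` with fences), the flag has to feed the cache hash, or a kernel cached in one mode could be served to a run expecting the other ABI. A toy illustration of the invariant, using a hypothetical helper rather than the repo's actual `get_hash`:

```python
import hashlib

def cache_key(option_fields: tuple) -> str:
    # Hash every option value that affects the generated artifact.
    return hashlib.sha256(repr(option_fields).encode()).hexdigest()

# Options differing only in iree_launch_async must not collide: they
# produce different entry points and therefore different vmfbs.
assert cache_key(("gfx942", True)) != cache_key(("gfx942", False))
```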
