Bump IREE to iree-org/iree@337c8aaf (#1349)

yzhang93 · web-flow · commit b5dee683b5bd · 2025-08-21T21:46:09.000+01:00
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_transpose_a_KxM_KxN.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul_transpose_a_KxM_KxN.mlir
@@ -6,7 +6,13 @@ func.func @matmul_transpose_a(%arg0: tensor<${K}x${M}x${TYPE1}>, %arg1: tensor<$
   %cst = arith.constant ${ZERO} : ${TYPE2}
   %0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
   %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
-  %2 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<${K}x${M}x${TYPE1}>, tensor<${K}x${N}x${TYPE1}>)
+  %2 = linalg.matmul
+    indexing_maps = [
+      affine_map<(d0, d1, d2) -> (d2, d0)>,
+      affine_map<(d0, d1, d2) -> (d2, d1)>,
+      affine_map<(d0, d1, d2) -> (d0, d1)>
+    ]
+    ins(%arg0, %arg1 : tensor<${K}x${M}x${TYPE1}>, tensor<${K}x${N}x${TYPE1}>)
     outs(%1: tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
   return %2: tensor<${M}x${N}x${TYPE2}>
 }
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_transpose_b_MxK_NxK.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul_transpose_b_MxK_NxK.mlir
@@ -6,7 +6,13 @@ func.func @matmul_transpose_b(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<$
   %cst = arith.constant ${ZERO} : ${TYPE2}
   %0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
   %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
-  %2 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${N}x${K}x${TYPE1}>)
+  %2 = linalg.matmul
+    indexing_maps = [
+      affine_map<(d0, d1, d2) -> (d0, d2)>,
+      affine_map<(d0, d1, d2) -> (d1, d2)>,
+      affine_map<(d0, d1, d2) -> (d0, d1)>
+    ]
+    ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${N}x${K}x${TYPE1}>)
     outs(%1: tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
   return %2: tensor<${M}x${N}x${TYPE2}>
 }
diff --git a/build_tools/ci/generate_e2e_matmul_tests.py b/build_tools/ci/generate_e2e_matmul_tests.py
@@ -261,7 +261,7 @@ def generate_function(
     acc_tensor_type = f"tensor<{acc_m}x{acc_n}x{acc_type.value}>"
 
     if transpose_rhs:
-        op_name = "linalg.matmul_transpose_b"
+        op_name = "linalg.matmul indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]"
     else:
         op_name = "linalg.matmul"
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp
@@ -253,7 +253,7 @@ bool isMatmul(linalg::LinalgOp linalgOp) {
 bool isMatmulTransposeA(linalg::LinalgOp linalgOp) {
   // Step 0. Test if the op itself is a linalg.matmul_transpose_a op.
   if (isa<linalg::MatmulTransposeAOp, linalg::BatchMatmulTransposeAOp>(
-          linalgOp))
+          linalgOp.getOperation()))
     return true;
   if (!isa<linalg::GenericOp>(linalgOp)) return false;
 
@@ -282,7 +282,7 @@ bool isMatmulTransposeA(linalg::LinalgOp linalgOp) {
 bool isMatmulTransposeB(linalg::LinalgOp linalgOp) {
   // Step 0. Test if the op itself is a linalg.matmul_transpose_b op.
   if (isa<linalg::MatmulTransposeBOp, linalg::BatchMatmulTransposeBOp>(
-          linalgOp))
+          linalgOp.getOperation()))
     return true;
   if (!isa<linalg::GenericOp>(linalgOp)) return false;
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_workgroup_count.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_workgroup_count.mlir
@@ -2,7 +2,7 @@
 hal.executable private @test {
   hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd"}>) {
     hal.executable.export public @test_export ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>]>) count(%arg0: !hal.device) -> (index, index, index) {
-      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
       hal.return %x, %y, %z : index, index, index
     }
     builtin.module {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir
@@ -71,8 +71,15 @@ builtin.module {
     %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xi32>> -> tensor<1024x512xi32>
     %5 = tensor.empty() : tensor<256x1024xi32>
     %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
-    // CHECK:  linalg.matmul_transpose_b {lowering_config = #config, packing_config = #packingConfig}
-    %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<256x512xi32>, tensor<1024x512xi32>) outs(%6 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
+    %7 = linalg.matmul
+      indexing_maps = [
+        affine_map<(d0, d1, d2) -> (d0, d2)>,
+        affine_map<(d0, d1, d2) -> (d1, d2)>,
+        affine_map<(d0, d1, d2) -> (d0, d1)>
+      ]
+      ins(%3, %4 : tensor<256x512xi32>, tensor<1024x512xi32>)
+      outs(%6 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
     iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<256x1024xi32>>
     return
   }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir
@@ -174,8 +174,15 @@ module {
     %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x256xbf16>> -> tensor<128x256xbf16>
     %5 = tensor.empty() : tensor<128x128xf32>
     %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32>
-    // CHECK:  linalg.matmul_transpose_b {lowering_config = #config, packing_config = #packingConfig}
-    %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<128x256xbf16>, tensor<128x256xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32>
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
+    %7 = linalg.matmul
+      indexing_maps = [
+        affine_map<(d0, d1, d2) -> (d0, d2)>,
+        affine_map<(d0, d1, d2) -> (d1, d2)>,
+        affine_map<(d0, d1, d2) -> (d0, d1)>
+      ]
+      ins(%3, %4 : tensor<128x256xbf16>, tensor<128x256xbf16>)
+      outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32>
     iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<128x128xf32>>
     return
   }
@@ -210,8 +217,15 @@ module {
     %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<256x128xbf16>> -> tensor<256x128xbf16>
     %5 = tensor.empty() : tensor<128x128xf32>
     %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x128xf32>) -> tensor<128x128xf32>
-    // CHECK:  linalg.matmul_transpose_a {lowering_config = #config, packing_config = #packingConfig}
-    %7 = linalg.matmul_transpose_a ins(%3, %4 : tensor<256x128xbf16>, tensor<256x128xbf16>) outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32>
+    // CHECK:  linalg.matmul {lowering_config = #config, packing_config = #packingConfig}
+    %7 = linalg.matmul
+      indexing_maps = [
+        affine_map<(d0, d1, d2) -> (d2, d0)>,
+        affine_map<(d0, d1, d2) -> (d2, d1)>,
+        affine_map<(d0, d1, d2) -> (d0, d1)>
+      ]
+      ins(%3, %4 : tensor<256x128xbf16>, tensor<256x128xbf16>)
+      outs(%6 : tensor<128x128xf32>) -> tensor<128x128xf32>
     iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<128x128xf32>>
     return
   }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/pack_and_transpose_level1.mlir
@@ -34,6 +34,11 @@ func.func @matmul_transpose_b_dispatch_0_matmul_transpose_b_256x1024x512_i32(%ar
   // CHECK:       linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [64, 64] into %{{.*}} : tensor<256x1024xi32> -> tensor<4x16x64x64xi32>
   // CHECK:       linalg.generic
   // CHECK-SAME:  attrs =  {lowering_config = #config, packing_config = #packingConfig}
-  %2 = linalg.matmul_transpose_b {lowering_config = #config, packing_config = #packingConfig} ins(%arg0, %arg1 : tensor<256x512xi32>, tensor<1024x512xi32>) outs(%1 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
+  %2 = linalg.matmul
+    indexing_maps = [
+      affine_map<(d0, d1, d2) -> (d0, d2)>,
+      affine_map<(d0, d1, d2) -> (d1, d2)>,
+      affine_map<(d0, d1, d2) -> (d0, d1)>
+    ] {lowering_config = #config, packing_config = #packingConfig} ins(%arg0, %arg1 : tensor<256x512xi32>, tensor<1024x512xi32>) outs(%1 : tensor<256x1024xi32>) -> tensor<256x1024xi32>
   return %2 : tensor<256x1024xi32>
 }
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir
@@ -14,7 +14,7 @@
 !fdt_res = !iree_tensor_ext.dispatch.tensor<writeonly:tensor<512x512xf32>>
 hal.executable.source public @amdaie_fb {
   hal.executable.export public @mm_512_512_4096_bf16_f32 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
-    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
     hal.return %x, %y, %z : index, index, index
   }
   builtin.module {
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc
@@ -160,9 +160,9 @@ TEST_P(MatMulDispatchTest, DispatchMatmul) {
       binding_table.count, &command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
 
-  uint32_t workgroup_count[3] = {1, 1, 1};
   IREE_ASSERT_OK(iree_hal_command_buffer_dispatch(
-      command_buffer, executable_, /*entry_point=*/0, workgroup_count,
+      command_buffer, executable_, /*entry_point=*/0,
+      iree_hal_make_static_dispatch_config(1, 1, 1),
       iree_const_byte_span_empty(), bindings, IREE_HAL_DISPATCH_FLAG_NONE));
 
   IREE_ASSERT_OK(iree_hal_command_buffer_execution_barrier(
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc
@@ -241,7 +241,7 @@ static iree_status_t iree_hal_xrt_lite_direct_command_buffer_reconfigure(
 static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* base_executable, int32_t entry_point,
-    const uint32_t workgroup_count[3], iree_const_byte_span_t constants,
+    const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
     iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) {
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -314,6 +314,5 @@ const iree_hal_command_buffer_vtable_t
         .update_buffer = iree_hal_xrt_lite_direct_command_buffer_update_buffer,
         .copy_buffer = iree_hal_xrt_lite_direct_command_buffer_copy_buffer,
         .dispatch = iree_hal_xrt_lite_direct_command_buffer_dispatch,
-        .dispatch_indirect = unimplemented,
 };
 }  // namespace
diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir
@@ -10,7 +10,7 @@
 >
 hal.executable.source public @amdaie_fb {
   hal.executable.export public @matmul_f32_dispatch_0_matmul_32x32x32_f32 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
-    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice
+    %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
     hal.return %x, %y, %z : index, index, index
   }
   builtin.module {
diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc
@@ -4,6 +4,8 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+#include <chrono>
+
 #include "iree-amd-aie/driver/xrt/registration/driver_module.h"
 #include "iree/base/api.h"
 #include "iree/base/string_view.h"
@@ -170,9 +172,9 @@ TEST_P(MatMulDispatchTest, DispatchMatmul) {
       binding_table.count, &command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
 
-  uint32_t workgroup_count[3] = {1, 1, 1};
   IREE_ASSERT_OK(iree_hal_command_buffer_dispatch(
-      command_buffer, executable_, /*entry_point=*/0, workgroup_count,
+      command_buffer, executable_, /*entry_point=*/0,
+      iree_hal_make_static_dispatch_config(1, 1, 1),
       iree_const_byte_span_empty(), bindings, IREE_HAL_DISPATCH_FLAG_NONE));
 
   IREE_ASSERT_OK(iree_hal_command_buffer_execution_barrier(
diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc
@@ -420,7 +420,7 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_reconfigure(
 static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch(
     iree_hal_command_buffer_t* base_command_buffer,
     iree_hal_executable_t* executable, int32_t entry_point,
-    const uint32_t workgroup_count[3], iree_const_byte_span_t constants,
+    const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
     iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) {
   iree_hal_xrt_direct_command_buffer_t* command_buffer =
       iree_hal_xrt_direct_command_buffer_cast(base_command_buffer);
@@ -463,15 +463,6 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch(
   return iree_ok_status();
 }
 
-static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch_indirect(
-    iree_hal_command_buffer_t* base_command_buffer,
-    iree_hal_executable_t* executable, int32_t entry_point,
-    iree_hal_buffer_ref_t workgroups_ref, iree_const_byte_span_t constants,
-    iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) {
-  return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
-                          "need xrt implementation of dispatch indirect");
-}
-
 namespace {
 const iree_hal_command_buffer_vtable_t
     iree_hal_xrt_direct_command_buffer_vtable = {
@@ -495,7 +486,5 @@ const iree_hal_command_buffer_vtable_t
         /*.collective = */
         iree_hal_xrt_direct_command_buffer_collective,
         /*.dispatch = */ iree_hal_xrt_direct_command_buffer_dispatch,
-        /*.dispatch_indirect = */
-        iree_hal_xrt_direct_command_buffer_dispatch_indirect,
 };
 }  // namespace
diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc
@@ -395,6 +395,7 @@ const iree_hal_device_vtable_t iree_hal_xrt_device_vtable = {
     /*.queue_copy=*/iree_hal_device_queue_emulated_copy,
     /*.queue_read=*/iree_hal_xrt_device_queue_read,
     /*.queue_write = */ iree_hal_xrt_device_queue_write,
+    /*.queue_dispatch=*/iree_hal_device_queue_emulated_dispatch,
     /*.queue_execute = */ iree_hal_xrt_device_queue_execute,
     /*.queue_flush = */ iree_hal_xrt_device_queue_flush,
     /*.wait_semaphores = */ iree_hal_xrt_device_wait_semaphores,
diff --git a/third_party/iree b/third_party/iree
@@ -1 +1 @@
-Subproject commit b7f442b35a4c88ace7f0c03e92ee6e992690b951
+Subproject commit 337c8aaf544053ac512bb7c023c572f8e2d51e7e

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`hal.executable private @test {`
`3`	`3`	`hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_arch = "chip-tbd"}>) {`
`4`	`4`	`hal.executable.export public @test_export ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>]>) count(%arg0: !hal.device) -> (index, index, index) {`
`5`		`- %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice`
	`5`	`+ %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()`
`6`	`6`	`hal.return %x, %y, %z : index, index, index`
`7`	`7`	`}`
`8`	`8`	`builtin.module {`
Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@`
`14`	`14`	`!fdt_res = !iree_tensor_ext.dispatch.tensor<writeonly:tensor<512x512xf32>>`
`15`	`15`	`hal.executable.source public @amdaie_fb {`
`16`	`16`	`hal.executable.export public @mm_512_512_4096_bf16_f32 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {`
`17`		`- %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice`
	`17`	`+ %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()`
`18`	`18`	`hal.return %x, %y, %z : index, index, index`
`19`	`19`	`}`
`20`	`20`	`builtin.module {`