[GPU][DT] dce unused tensor.dim ops in SpecializeExports (iree-org#21624)

jtuyls · keshavvinayak01 · commit aa8154fcf645 · 2025-09-04T18:04:06.000Z
Resolves: iree-org#21375. Fixes an issue in e2e llama3 with data-tiling. The SpecializeExports pass creates new tensor.dim operations when retrieving the iteration domain of an operation and leaves them around. When this operates on an encoded tensor, the subsequent MaterializeDeviceEncoding pass will fail on legalizing this tensor.dim operation as it is left around operating on an encoded tensor. We can get around this issue by performing canonicalization (with dce) at the end of SpecializeExports. I also tried adjusting SpecializeExports earlier to clean up the generated tensor.dim operations but that doesn't work as getIterationDomain doesn't give access to all of them. I created an issue and added a TODO to look into that more: iree-org#21623. Signed-off-by: Jorn Tuyls <jorn.tuyls@gmail.com> Signed-off-by: keshavvinayak01 <keshavvinayakjha@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/SpecializeExports.cpp b/compiler/src/iree/compiler/Codegen/Common/SpecializeExports.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/Visitors.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 #define DEBUG_TYPE "iree-codegen-specialize-exports"
 
@@ -462,6 +463,17 @@ struct SpecializeExportsPass final
       specializeExportedFunctionByRangeAttribute(exportOp, exportedFunc, helper,
                                                  ordinalSet);
     }
+
+    // TODO(#21623): We need DCE after this pass as it can leave around
+    // `tensor.dim` operations that can mess up the next passes (e.g.
+    // MaterializeDeviceEncoding). Ideally, we would avoid creating
+    // those ops altogether if not needed.
+    MLIRContext *ctx = &getContext();
+    RewritePatternSet patterns(ctx);
+    tensor::DimOp::getCanonicalizationPatterns(patterns, ctx);
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+      signalPassFailure();
+    }
   }
 };
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/specialize_exports.mlir b/compiler/src/iree/compiler/Codegen/Common/test/specialize_exports.mlir
@@ -1,5 +1,5 @@
 // RUN: iree-opt %s \
-// RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-specialize-exports, cse)))" \
+// RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-specialize-exports)))" \
 // RUN:   --split-input-file | FileCheck %s
 
 #executable_target_embedded_elf_aarch64 = #hal.executable.target<"llvm-cpu", "embedded-elf-aarch64">
@@ -40,18 +40,18 @@ hal.executable private @single_specialization_executable {
   }
 }
 
+// Note the `CHECK-NOT: tensor.dim`, which checks that unused `tensor.dim` ops are eliminated.
+
 // CHECK-LABEL: hal.executable private @single_specialization_executable
 
 //       CHECK:   hal.executable.export public @matmul_transpose_b_Dx1024x4096_f16xf16xf32 ordinal(0)
 //  CHECK-SAME:     condition(%{{.*}}: !hal.device, %[[W:.+]]: index) -> i1
-//   CHECK-DAG:       %[[TRUE:.+]] = arith.constant true
-//       CHECK:       %[[UMIN:.+]] = arith.cmpi ule, %c128, %[[W]]
-//       CHECK:       %[[CMIN:.+]] = arith.andi %[[UMIN]], %[[TRUE]]
-//       CHECK:       %[[UMAX:.+]] = arith.cmpi uge, %c4096, %[[W]]
-//       CHECK:       %[[CMAX:.+]] = arith.andi %[[UMAX]], %[[CMIN]]
-//       CHECK:       %[[UREM:.+]] = arith.remui %[[W]], %c128
-//       CHECK:       %[[UDIV:.+]] = arith.cmpi eq, %[[UREM]], %c0
-//       CHECK:       %[[CDIV:.+]] = arith.andi %[[UDIV]], %[[CMAX]]
+//   CHECK-DAG:       %[[UMAX:.+]] = arith.cmpi uge, %[[W]], %c128
+//   CHECK-DAG:       %[[UMIN:.+]] = arith.cmpi ule, %[[W]], %c4096
+//   CHECK-DAG:       %[[CMIN:.+]] = arith.andi %[[UMIN]], %[[UMAX]]
+//   CHECK-DAG:       %[[UREM:.+]] = arith.remui %[[W]], %c128
+//   CHECK-DAG:       %[[UDIV:.+]] = arith.cmpi eq, %[[UREM]], %c0
+//       CHECK:       %[[CDIV:.+]] = arith.andi %[[UDIV]], %[[CMIN]]
 //       CHECK:       hal.return %[[CDIV]]
 //       CHECK:     fallback(@matmul_transpose_b_Dx1024x4096_f16xf16xf32_0)
 //  CHECK-SAME:     count(%{{[A-Za-z0-9]*}}: !hal.device
@@ -63,8 +63,11 @@ hal.executable private @single_specialization_executable {
 //       CHECK:   builtin.module
 //       CHECK:     func.func @matmul_transpose_b_Dx1024x4096_f16xf16xf32
 //       CHECK:       util.assume.int %{{.*}}<umin = 128, umax = 4096, udiv = 128>
+//   CHECK-NOT:       tensor.dim
 //       CHECK:     func.func @matmul_transpose_b_Dx1024x4096_f16xf16xf32_0
 //       CHECK:       util.assume.int %{{.*}}<umin = 256, umax = 1048320, udiv = 256>
+//   CHECK-NOT:       tensor.dim
+
 
 // -----
 
@@ -178,20 +181,18 @@ hal.executable private @multiple_dimension_assume {
 
 //       CHECK:   hal.executable.export public @matmul_transpose_b_Dx1024x4096_f16xf16xf32 ordinal(0)
 //  CHECK-SAME:     condition(%{{.*}}: !hal.device, %[[W0:[A-Za-z0-9]+]]: index, %[[W1:[A-Za-z0-9]+]]: index) -> i1
-//       CHECK:       %[[TRUE:.+]] = arith.constant true
-//       CHECK:       %[[UMIN:.+]] = arith.cmpi ule, %c128, %[[W0]]
-//       CHECK:       %[[CMIN:.+]] = arith.andi %[[UMIN]], %[[TRUE]]
-//       CHECK:       %[[UMAX:.+]] = arith.cmpi uge, %c4096, %[[W0]]
-//       CHECK:       %[[CMAX:.+]] = arith.andi %[[UMAX]], %[[CMIN]]
-//       CHECK:       %[[UREM:.+]] = arith.remui %[[W0]], %c128
-//       CHECK:       %[[UDIV:.+]] = arith.cmpi eq, %[[UREM]], %c0
-//       CHECK:       %[[CDIV:.+]] = arith.andi %[[UDIV]], %[[CMAX]]
-//       CHECK:       %[[UMIN1:.+]] = arith.cmpi ule, %c128, %[[W1]]
-//       CHECK:       %[[CMIN1:.+]] = arith.andi %[[UMIN1]], %[[CDIV]]
-//       CHECK:       %[[UMAX1:.+]] = arith.cmpi uge, %c4096, %[[W1]]
-//       CHECK:       %[[CMAX1:.+]] = arith.andi %[[UMAX1]], %[[CMIN1]]
-//       CHECK:       %[[UREM1:.+]] = arith.remui %[[W1]], %c128
-//       CHECK:       %[[UDIV1:.+]] = arith.cmpi eq, %[[UREM1]], %c0
+//   CHECK-DAG:       %[[UMAX:.+]] = arith.cmpi uge, %[[W0]], %c128
+//   CHECK-DAG:       %[[UMIN:.+]] = arith.cmpi ule, %[[W0]], %c4096
+//   CHECK-DAG:       %[[CMAX:.+]] = arith.andi %[[UMIN]], %[[UMAX]]
+//   CHECK-DAG:       %[[UREM:.+]] = arith.remui %[[W0]], %c128
+//   CHECK-DAG:       %[[UDIV:.+]] = arith.cmpi eq, %[[UREM]], %c0
+//   CHECK-DAG:       %[[CDIV:.+]] = arith.andi %[[UDIV]], %[[CMAX]]
+//   CHECK-DAG:       %[[UMAX1:.+]] = arith.cmpi uge, %[[W1]], %c128
+//   CHECK-DAG:       %[[CMIN1:.+]] = arith.andi %[[UMAX1]], %[[CDIV]]
+//   CHECK-DAG:       %[[UMIN1:.+]] = arith.cmpi ule, %[[W1]], %c4096
+//   CHECK-DAG:       %[[CMAX1:.+]] = arith.andi %[[UMIN1]], %[[CMIN1]]
+//   CHECK-DAG:       %[[UREM1:.+]] = arith.remui %[[W1]], %c128
+//   CHECK-DAG:       %[[UDIV1:.+]] = arith.cmpi eq, %[[UREM1]], %c0
 //       CHECK:       %[[CDIV1:.+]] = arith.andi %[[UDIV1]], %[[CMAX1]]
 //       CHECK:       hal.return %[[CDIV1]]
 //       CHECK:     fallback(@matmul_transpose_b_Dx1024x4096_f16xf16xf32_0)
@@ -200,17 +201,15 @@ hal.executable private @multiple_dimension_assume {
 
 //       CHECK:   hal.executable.export public @matmul_transpose_b_Dx1024x4096_f16xf16xf32_0 ordinal(1)
 //  CHECK-SAME:     condition(%{{.*}}: !hal.device, %[[W0:[A-Za-z0-9]+]]: index, %[[W1:[A-Za-z0-9]+]]: index) -> i1
-//       CHECK:       %[[TRUE:.+]] = arith.constant true
-//       CHECK:       %[[UMIN:.+]] = arith.cmpi ule, %c4096, %[[W0]]
-//       CHECK:       %[[CMIN:.+]] = arith.andi %[[UMIN]], %[[TRUE]]
-//       CHECK:       %[[UREM:.+]] = arith.remui %[[W0]], %c256
-//       CHECK:       %[[UDIV:.+]] = arith.cmpi eq, %[[UREM]], %c0
-//       CHECK:       %[[CDIV:.+]] = arith.andi %[[UDIV]], %[[CMIN]]
-//       CHECK:       %[[UMIN1:.+]] = arith.cmpi ule, %c4096, %[[W1]]
-//       CHECK:       %[[CMIN1:.+]] = arith.andi %[[UMIN1]], %[[CDIV]]
-//       CHECK:       %[[UREM1:.+]] = arith.remui %[[W1]], %c256
-//       CHECK:       %[[UDIV1:.+]] = arith.cmpi eq, %[[UREM1]], %c0
-//       CHECK:       %[[CDIV1:.+]] = arith.andi %[[UDIV1]], %[[CMIN1]]
+//   CHECK-DAG:       %[[UMAX:.+]] = arith.cmpi uge, %[[W0]], %c4096
+//   CHECK-DAG:       %[[UREM:.+]] = arith.remui %[[W0]], %c256
+//   CHECK-DAG:       %[[UDIV:.+]] = arith.cmpi eq, %[[UREM]], %c0
+//   CHECK-DAG:       %[[CDIV:.+]] = arith.andi %[[UDIV]], %[[UMAX]]
+//   CHECK-DAG:       %[[UMAX1:.+]] = arith.cmpi uge, %[[W1]], %c4096
+//   CHECK-DAG:       %[[CMIN:.+]] = arith.andi %[[UMAX1]], %[[CDIV]]
+//   CHECK-DAG:       %[[UREM1:.+]] = arith.remui %[[W1]], %c256
+//   CHECK-DAG:       %[[UDIV1:.+]] = arith.cmpi eq, %[[UREM1]], %c0
+//       CHECK:       %[[CDIV1:.+]] = arith.andi %[[UDIV1]], %[[CMIN]]
 //       CHECK:       hal.return %[[CDIV1]]
 //       CHECK:     fallback(@matmul_transpose_b_Dx1024x4096_f16xf16xf32_0_1)
 //  CHECK-SAME:     count(%{{[A-Za-z0-9]*}}: !hal.device