
Commit 9bb1a2b

jtuyls and qedawkins authored
[ROCM] Port mlir ukernels to ukernel descriptor lowering flow (iree-org#21683)
This copies all ukernels from the tuning spec to the ukernel descriptor based lowering and PDL patterns. This doesn't remove the ukernels in the spec yet, as that requires the usage of `--iree-codegen-enable-default-tuning-specs=true` to be updated to `--iree-hip-enable-tensor-ukernels` everywhere, which in my opinion is better done in a separate PR.

The ukernels and matching patterns being copied in this PR:

- pingpong_large_f8_expanded
- pingpong_large_f16
- pingpong_medium_f16_expanded
- pingpong_large_f16_expanded
- pingpong_large_bf16
- pingpong_medium_bf16_expanded
- pingpong_large_bf16_expanded

Note that the mmt_2048x1280x5120_f16_f16_f32 matching and annotation is not ported, as it should not be reachable: pingpong_large_f16 matches the same case and takes precedence.

---------

Signed-off-by: Jorn Tuyls <[email protected]>
Co-authored-by: Quinn Dawkins <[email protected]>
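For context, the ported PDL patterns annotate matching contraction ops with an `iree_codegen.ukernel` descriptor attribute (alongside a `compilation_info` attribute, omitted here). Below is a minimal sketch of what an annotated op might look like, assuming the 2048x1280x5120 f16 case mentioned above is matched by `pingpong_large_f16`; the function name and exact attribute placement are illustrative, not taken from the patch.

```mlir
// Hedged sketch (not from the patch): the kind of annotation the ported PDL
// patterns attach to a matching contraction. Shapes follow the
// mmt_2048x1280x5120_f16_f16_f32 case mentioned in the commit message.
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
func.func @annotated_matmul_f16(%arg0: tensor<2048x5120xf16>, %arg1: tensor<1280x5120xf16>) -> tensor<2048x1280xf32> {
  %cst = arith.constant 0.000000e+00 : f32
  %0 = tensor.empty() : tensor<2048x1280xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2048x1280xf32>) -> tensor<2048x1280xf32>
  // The ukernel descriptor attribute is what the lowering flow later keys on.
  %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]}
      ins(%arg0, %arg1 : tensor<2048x5120xf16>, tensor<1280x5120xf16>) outs(%1 : tensor<2048x1280xf32>)
      attrs = {iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_f16", tensor>} {
  ^bb0(%in: f16, %in_0: f16, %out: f32):
    %3 = arith.extf %in : f16 to f32
    %4 = arith.extf %in_0 : f16 to f32
    %5 = arith.mulf %3, %4 : f32
    %6 = arith.addf %out, %5 : f32
    linalg.yield %6 : f32
  } -> tensor<2048x1280xf32>
  return %2 : tensor<2048x1280xf32>
}
```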
1 parent 46de78a commit 9bb1a2b

File tree

13 files changed: +2579 -33 lines changed


compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/ApplyBuiltinPDLPatterns.cpp

Lines changed: 9 additions & 2 deletions
@@ -16,6 +16,7 @@
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
 #include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
+#include "iree/compiler/Utils/ShapeUtils.h"
 #include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -61,7 +62,8 @@ static LogicalResult annotateOperation(PatternRewriter &rewriter,
     return rewriter.notifyMatchFailure(rootOp,
                                        "expected StringAttr for attr name.");
   }
-  rootOp->setAttr(strName.strref(), annotation);
+  rewriter.modifyOpInPlace(
+      rootOp, [&]() { rootOp->setAttr(strName.strref(), annotation); });
   return success();
 }
 
@@ -75,7 +77,6 @@ static LogicalResult matchContraction(PatternRewriter &rewriter,
     return rewriter.notifyMatchFailure(rootOp,
                                        "not a contraction like linalg op");
   }
-
   if (linalgOp.getIndexingMaps() != indexingMaps) {
     return rewriter.notifyMatchFailure(rootOp, "indexing maps mismatch");
   }
@@ -102,6 +103,9 @@ static LogicalResult dimIsMultipleOf(PatternRewriter &rewriter, Value value,
   if (!dim) {
     return failure();
   }
+  if (dimInt.getInt() >= shapedType.getRank()) {
+    return failure();
+  }
   auto divisorInt = dyn_cast<IntegerAttr>(divisor);
   if (!divisor) {
     return failure();
@@ -140,6 +144,9 @@ static LogicalResult dimIsBound(PatternRewriter &rewriter, Value value,
   if (!dimInt) {
     return failure();
   }
+  if (dimInt.getInt() >= shapedType.getRank()) {
+    return failure();
+  }
   if (auto lowerBoundInt = dyn_cast<IntegerAttr>(lowerBound)) {
     FailureOr<int64_t> constantLb =
         ValueBoundsConstraintSet::computeConstantBound(

compiler/plugins/target/ROCM/Dialect/ROCM/Transforms/test/apply_builtin_ukernel_pdl_patterns.mlir

Lines changed: 74 additions & 0 deletions
@@ -133,3 +133,77 @@ func.func @negative_matmul_f8_dynamic_lower_bound(%arg0: index) -> tensor<1x128x
 // CHECK-LABEL: @negative_matmul_f8_dynamic_lower_bound
 // CHECK-NOT: compilation_info = #iree_codegen.compilation_info
 // CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor>
+
+// -----
+
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @negative_matmul_f16(%arg0: tensor<256x4096xf16>, %arg1: tensor<1024x4096xf16>) -> tensor<256x1024xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<256x1024xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x1024xf32>) -> tensor<256x1024xf32>
+  %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<256x4096xf16>, tensor<1024x4096xf16>) outs(%1 : tensor<256x1024xf32>) {
+  ^bb0(%in: f16, %in_4: f16, %out: f32):
+    %12 = arith.extf %in : f16 to f32
+    %13 = arith.extf %in_4 : f16 to f32
+    %14 = arith.mulf %12, %13 : f32
+    %15 = arith.addf %out, %14 : f32
+    linalg.yield %15 : f32
+  } -> tensor<256x1024xf32>
+  return %2 : tensor<256x1024xf32>
+}
+// CHECK-LABEL: @negative_matmul_f16
+// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
+// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+
+// -----
+
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @negative_matmul_bf16(%arg0: tensor<256x4096xbf16>, %arg1: tensor<1024x4096xbf16>) -> tensor<256x1024xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<256x1024xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<256x1024xf32>) -> tensor<256x1024xf32>
+  %2 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<256x4096xbf16>, tensor<1024x4096xbf16>) outs(%1 : tensor<256x1024xf32>) {
+  ^bb0(%in: bf16, %in_4: bf16, %out: f32):
+    %12 = arith.extf %in : bf16 to f32
+    %13 = arith.extf %in_4 : bf16 to f32
+    %14 = arith.mulf %12, %13 : f32
+    %15 = arith.addf %out, %14 : f32
+    linalg.yield %15 : f32
+  } -> tensor<256x1024xf32>
+  return %2 : tensor<256x1024xf32>
+}
+// CHECK-LABEL: @negative_matmul_bf16
+// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
+// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor
+
+// -----
+
+// The dynamic dimension is a multiple of 256, but doesn't have a lower bound of 256.
+
+#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+func.func @negative_matmul_bf16_dynamic_lower_bound(%arg0: index) -> tensor<1x256x1024xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = util.assume.int %arg0<umin = 128, udiv = 256> : index
+  %1 = tensor.empty(%0) : tensor<1x256x?xbf16>
+  %2 = tensor.empty(%0) : tensor<1024x?xbf16>
+  %3 = tensor.empty() : tensor<1x256x1024xf32>
+  %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf32>
+  %5 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%1, %2 : tensor<1x256x?xbf16>, tensor<1024x?xbf16>) outs(%4 : tensor<1x256x1024xf32>) {
+  ^bb0(%in: bf16, %in_0: bf16, %out: f32):
+    %6 = arith.extf %in : bf16 to f32
+    %7 = arith.extf %in_0 : bf16 to f32
+    %8 = arith.mulf %6, %7 : f32
+    %9 = arith.addf %out, %8 : f32
+    linalg.yield %9 : f32
+  } -> tensor<1x256x1024xf32>
+  return %5 : tensor<1x256x1024xf32>
+}
+// CHECK-LABEL: @negative_matmul_bf16_dynamic_lower_bound
+// CHECK-NOT: compilation_info = #iree_codegen.compilation_info
+// CHECK-NOT: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_large_bf16_expanded", tensor>
