[Codegen] Disallow padding more skinny matmuls (iree-org#20289)

kuhar · web-flow · commit 8da43fe6a570 · 2025-03-18T12:32:28.000-04:00
Raise the no-padding threshold introduced in iree-org#20284 from 16 to 64 so that other skinny matmuls, not just matvec / vecmat problems, are also excluded from padding. Add a TODO to refactor this in the future. Also add debug prints for non-zero padding, enabled with `--debug-only=iree-encoding-attrs`.

Signed-off-by: Jakub Kuderski <jakub@nod-labs.com>
diff --git a/compiler/src/iree/compiler/Codegen/ExternalInterfaces/GPUEncodingExternalModels.cpp b/compiler/src/iree/compiler/Codegen/ExternalInterfaces/GPUEncodingExternalModels.cpp
@@ -449,7 +449,7 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
       return noPaddingAttr;
     }
 
-    // Bail out on matvec / vecmat problems.
+    // Bail out on matvec / vecmat and skinny matmul problems.
     {
       int64_t parallelDimSize = 1;
       ArrayRef<unsigned> parallelDims =
@@ -466,10 +466,11 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
         }
       }
 
-      static constexpr int64_t kMatVecThreshold = 16;
+      // TODO(#19897): Use `getMatmulNarrowDim`.
+      static constexpr int64_t kSkinnyMatmulThreshold = 64;
       if (!ShapedType::isDynamic(parallelDimSize) &&
-          parallelDimSize < kMatVecThreshold) {
-        // This matmul is more similar to a matvec, do not pad.
+          parallelDimSize < kSkinnyMatmulThreshold) {
+        // This matmul is skinny, do not pad.
         return noPaddingAttr;
       }
     }
diff --git a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp
@@ -7,6 +7,7 @@
 #include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h"
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Attributes.h"
@@ -19,6 +20,10 @@
 
 #include <cassert>
 
+#define DEBUG_TYPE "iree-encoding-attrs"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
 namespace mlir::iree_compiler::IREE::Encoding {
 
 //===---------------------------------------------------------------------===//
@@ -338,6 +343,9 @@ Value PadEncodingLayoutAttr::calculateStorageSizeInBytes(
     ValueRange dynamicDims) const {
   ArrayRef<int32_t> padding = getPadding().asArrayRef();
   assert(padding.size() == type.getRank() && "Invalid padding");
+  LLVM_DEBUG(if (llvm::any_of(padding, [](int32_t x) { return x != 0; })) {
+    llvm::dbgs() << "Non-zero padding: " << type << "\n";
+  });
 
   const int64_t elementSize = getRoundedElementByteWidth(type.getElementType());
   int64_t staticProduct = elementSize;
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir
@@ -103,7 +103,7 @@ util.func public @with_pad_encoding(%arg0: index, %arg1: index, %scalar_f32 : f3
   %2 = stream.tensor.empty on(#hal.device.affinity<@device_a>) : tensor<4096x1337xf16, #encodingA>{} in !stream.resource<*>{%arg1}
   %3 = stream.tensor.empty on(#hal.device.affinity<@device_a>) : tensor<4096x4095xf16, #encodingA>{} in !stream.resource<*>{%arg1}
   %4 = stream.tensor.empty on(#hal.device.affinity<@device_a>) : tensor<4096x250xf16, #encodingA>{} in !stream.resource<*>{%arg1}
-  %5 = stream.tensor.empty on(#hal.device.affinity<@device_a>) : tensor<15x4096xf16, #encodingA>{} in !stream.resource<*>{%arg1}
+  %5 = stream.tensor.empty on(#hal.device.affinity<@device_a>) : tensor<60x4096xf16, #encodingA>{} in !stream.resource<*>{%arg1}
   %6 = stream.tensor.empty on(#hal.device.affinity<@device_a>) : tensor<1x4096xf16, #encodingB>{} in !stream.resource<*>{%arg1}
   %7 = stream.tensor.empty on(#hal.device.affinity<@device_a>) : tensor<?x4096xf16, #encodingA>{%arg0} in !stream.resource<*>{%arg1}
   %8 = stream.tensor.empty on(#hal.device.affinity<@device_a>) : tensor<?x?xf16, #encodingA>{%arg0, %arg1} in !stream.resource<*>{%arg1}
@@ -128,7 +128,7 @@ util.func public @with_pad_encoding(%arg0: index, %arg1: index, %scalar_f32 : f3
 // CHECK: stream.tensor.empty {{.*}} : tensor<4096x1337xf16, #[[$PAD_LHS_1]]>
 // CHECK: stream.tensor.empty {{.*}} : tensor<4096x4095xf16, #[[$PAD_LHS_2]]>
 // CHECK: stream.tensor.empty {{.*}} : tensor<4096x250xf16, #[[$NO_PAD_LHS]]>
-// CHECK: stream.tensor.empty {{.*}} : tensor<15x4096xf16, #[[$NO_PAD_LHS]]>
+// CHECK: stream.tensor.empty {{.*}} : tensor<60x4096xf16, #[[$NO_PAD_LHS]]>
 // CHECK: stream.tensor.empty {{.*}} : tensor<1x4096xf16, #[[$NO_PAD_RHS]]>
 // CHECK: stream.tensor.empty {{.*}} : tensor<?x4096xf16, #[[$PAD_LHS_0]]>
 // CHECK: stream.tensor.empty {{.*}} : tensor<?x?xf16, #[[$NO_PAD_LHS]]>

Original file line number	Diff line number	Diff line change
`@@ -449,7 +449,7 @@ struct GPUPadEncodingLayoutResolverAttrInterface final`
`449`	`449`	`return noPaddingAttr;`
`450`	`450`	`}`
`451`	`451`
`452`		`- // Bail out on matvec / vecmat problems.`
	`452`	`+ // Bail out on matvec / vecmat and skinny matmul problems.`
`453`	`453`	`{`
`454`	`454`	`int64_t parallelDimSize = 1;`
`455`	`455`	`ArrayRef<unsigned> parallelDims =`
`@@ -466,10 +466,11 @@ struct GPUPadEncodingLayoutResolverAttrInterface final`
`466`	`466`	`}`
`467`	`467`	`}`
`468`	`468`
`469`		`- static constexpr int64_t kMatVecThreshold = 16;`
	`469`	``+ // TODO(#19897): Use `getMatmulNarrowDim`.``
	`470`	`+ static constexpr int64_t kSkinnyMatmulThreshold = 64;`
`470`	`471`	`if (!ShapedType::isDynamic(parallelDimSize) &&`
`471`		`- parallelDimSize < kMatVecThreshold) {`
`472`		`- // This matmul is more similar to a matvec, do not pad.`
	`472`	`+ parallelDimSize < kSkinnyMatmulThreshold) {`
	`473`	`+ // This matmul is skinny, do not pad.`
`473`	`474`	`return noPaddingAttr;`
`474`	`475`	`}`
`475`	`476`	`}`