iree-org
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_derived_thread_config.mlir‎
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_derived_thread_config.mlir‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Dialect/LinalgExt/IR/AggregatedOpInterfaceImpl.cpp‎
Lines changed: 10 additions & 3 deletions b/‎compiler/src/iree/compiler/Dialect/LinalgExt/IR/AggregatedOpInterfaceImpl.cpp‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎compiler/src/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp‎
Lines changed: 30 additions & 8 deletions b/‎compiler/src/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp‎
Lines changed: 30 additions & 8 deletions
diff --git a/‎compiler/src/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td‎
Lines changed: 16 additions & 2 deletions b/‎compiler/src/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎compiler/src/iree/compiler/Dialect/LinalgExt/IR/test/invalid.mlir‎
Lines changed: 37 additions & 0 deletions b/‎compiler/src/iree/compiler/Dialect/LinalgExt/IR/test/invalid.mlir‎
Lines changed: 37 additions & 0 deletions
@@ -179,6 +179,7 @@ module {
       strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
       m_offset = [0] * [1] k_offset = [0] * [1]
       batch_pos = [0] m_pos = [2, 3] k_pos = [1]
+      input_k_perm = [0, 1, 2]
       ins(%2 : tensor<2x34x34x128xf16>)
       outs(%3 : tensor<2x128x8xf16>) -> tensor<2x128x8xf16>
     return %4 : tensor<2x128x8xf16>
 
@@ -777,6 +777,12 @@ FailureOr<SmallVector<Value>> Im2colOp::decomposeOperation(OpBuilder &b) {
     }
     kBasis.push_back(size);
   }
+
+  // Transpose the order of (P, Q, C) according to `inputKPerm` encoded in
+  // im2col metadata.
+  ArrayRef<int64_t> inputKPerm = getInputKPerm();
+  applyPermutationToVector(kBasis, inputKPerm);
+
   OpFoldResult kIndex = kOffset;
   for (auto [i, ivIdx, stride] :
        llvm::enumerate(getKOutputDims(), getMixedKStrides())) {
@@ -792,17 +798,18 @@ FailureOr<SmallVector<Value>> Im2colOp::decomposeOperation(OpBuilder &b) {
            /*hasOuterBound=*/true)
           .getResults();
   // Split the delinearized offsets into the window offsets (for M offsets)
-  // and the K offsets for the input tensor.
+  // and the K offsets for the input tensor based on the layout.
   SmallVector<Value> windowOffset, inputKOffset;
   int delinKIdx = 0;
+  SmallVector<int64_t> invInputKPerm = invertPermutationVector(inputKPerm);
   for (int i = 0; i < getInputRank(); ++i) {
     if (batchPosSet.contains(i))
       continue;
     if (mPosSet.contains(i)) {
-      windowOffset.push_back(delinKOffset[delinKIdx++]);
+      windowOffset.push_back(delinKOffset[invInputKPerm[delinKIdx++]]);
       continue;
     }
-    inputKOffset.push_back(delinKOffset[delinKIdx++]);
+    inputKOffset.push_back(delinKOffset[invInputKPerm[delinKIdx++]]);
   }
 
   // Compute offsets for extract. The linearized im2col result M offset is
 
@@ -1646,13 +1646,16 @@ SmallVector<int64_t> Im2colOp::getKOutputDims() {
 }
 
 /// Custom builder methods for im2col op.
-void Im2colOp::build(
-    OpBuilder &builder, OperationState &state, Value input, Value output,
-    ArrayRef<int64_t> strides, ArrayRef<int64_t> dilations,
-    ArrayRef<OpFoldResult> kernelSize, ArrayRef<OpFoldResult> mOffset,
-    ArrayRef<OpFoldResult> mStrides, ArrayRef<OpFoldResult> kOffset,
-    ArrayRef<OpFoldResult> kStrides, ArrayRef<int64_t> batchPos,
-    ArrayRef<int64_t> mPos, ArrayRef<int64_t> kPos) {
+void Im2colOp::build(OpBuilder &builder, OperationState &state, Value input,
+                     Value output, ArrayRef<int64_t> strides,
+                     ArrayRef<int64_t> dilations,
+                     ArrayRef<OpFoldResult> kernelSize,
+                     ArrayRef<OpFoldResult> mOffset,
+                     ArrayRef<OpFoldResult> mStrides,
+                     ArrayRef<OpFoldResult> kOffset,
+                     ArrayRef<OpFoldResult> kStrides,
+                     ArrayRef<int64_t> batchPos, ArrayRef<int64_t> mPos,
+                     ArrayRef<int64_t> kPos, ArrayRef<int64_t> inputKPerm) {
   assert(strides.size() == kernelSize.size() &&
          dilations.size() == kernelSize.size() &&
          mPos.size() == kernelSize.size() &&
@@ -1680,7 +1683,8 @@ void Im2colOp::build(
         builder.getDenseI64ArrayAttr(staticKOffset), dynamicKStrides,
         builder.getDenseI64ArrayAttr(staticKStrides),
         builder.getDenseI64ArrayAttr(batchPos),
-        builder.getDenseI64ArrayAttr(mPos), builder.getDenseI64ArrayAttr(kPos));
+        builder.getDenseI64ArrayAttr(mPos), builder.getDenseI64ArrayAttr(kPos),
+        builder.getDenseI64ArrayAttr(inputKPerm));
 }
 
 LogicalResult Im2colOp::verify() {
@@ -1743,6 +1747,7 @@ LogicalResult Im2colOp::verify() {
   ArrayRef<int64_t> strides = getStrides();
   ArrayRef<int64_t> dilations = getDilations();
   SmallVector<OpFoldResult> kernelSize = getMixedKernelSize();
+  ArrayRef<int64_t> inputKPerm = getInputKPerm();
   if (kernelSize.size() != mPos.size()) {
     return op->emitOpError(
         "expected kernel rank to be equal to the m_pos rank");
@@ -1756,6 +1761,23 @@ LogicalResult Im2colOp::verify() {
         "expected dilations rank to be equal to the kernel rank");
   }
 
+  size_t sharedRank = mPos.size() + kPos.size();
+  if (inputKPerm.size() != sharedRank) {
+    return op->emitOpError("expected input_k_perm size (")
+           << inputKPerm.size()
+           << ") to match the number of shared dimensions (m_Pos + k_pos = "
+           << sharedRank << ")";
+  }
+  SmallVector<int64_t> permVec(inputKPerm.begin(), inputKPerm.end());
+  llvm::sort(permVec);
+  for (int64_t i = 0; i < static_cast<int64_t>(sharedRank); ++i) {
+    if (permVec[i] != i) {
+      return op->emitOpError(
+                 "expected input_k_perm to be a permutation of [0, ")
+             << sharedRank << ")";
+    }
+  }
+
   // Verify input and output shapes.
   ArrayRef<int64_t> inputShape = inputType.getShape();
   ArrayRef<int64_t> outputShape = outputType.getShape();
 
@@ -889,6 +889,17 @@ def IREELinalgExt_Im2colOp : IREELinalgExt_Op<"im2col",
     would be 4 for `K0`, and 1 for `K1`, meaning as `K0` increases by 1, the
     index into the flat `K` increases by 4. The strides in M from `m_strides`
     are orthogonal to the strides in `K` from `k_strides`.
+
+    The `input_k_perm` attribute defines the permutation needed to align the
+    reduction dimensions of the input layout with those of the filter layout
+    when computing the K dimension of the im2col output. This is useful when the
+    layout of the filter (e.g., `CHW`) differs from that of the input (e.g., `HWC`).
+    For instance, `input_k_perm = [2, 0, 1]` indicates the input indices need
+    to be transposed from `HWC` to `CHW` layout before extracting slices during
+    decomposition. The identity permutation (e.g., `input_k_perm = [0, 1, 2]`)
+    indicates that the input layout is already aligned with the filter layout
+    in terms of reduction dimensions, so no transposition of indices is necessary
+    before slice extraction.
   }];
 
   let arguments = (ins AnyShaped:$input, AnyShaped:$output,
@@ -906,7 +917,8 @@ def IREELinalgExt_Im2colOp : IREELinalgExt_Op<"im2col",
                        DenseI64ArrayAttr:$static_k_strides,
                        DenseI64ArrayAttr:$batch_pos,
                        DenseI64ArrayAttr:$m_pos,
-                       DenseI64ArrayAttr:$k_pos);
+                       DenseI64ArrayAttr:$k_pos,
+                       DenseI64ArrayAttr:$input_k_perm);
 
   let results = (outs Variadic<AnyShaped>:$results);
   let hasFolder = 1;
@@ -925,6 +937,7 @@ def IREELinalgExt_Im2colOp : IREELinalgExt_Op<"im2col",
     `batch_pos` `=` $batch_pos
     `m_pos` `=` $m_pos
     `k_pos` `=` $k_pos
+    `input_k_perm` `=` $input_k_perm
     `ins` `(` $input `:` type($input) `)`
     `outs` `(` $output `:` type($output) `)`
     (`->` type($results)^)?
@@ -941,7 +954,8 @@ def IREELinalgExt_Im2colOp : IREELinalgExt_Op<"im2col",
       "ArrayRef<OpFoldResult>":$k_strides,
       "ArrayRef<int64_t>":$batch_dimensions,
       "ArrayRef<int64_t>":$m_dimensions,
-      "ArrayRef<int64_t>":$k_dimensions)>
+      "ArrayRef<int64_t>":$k_dimensions,
+      "ArrayRef<int64_t>":$input_k_perm)>
   ];
 
   let extraClassDeclaration = extraLinalgExtOpClassDeclaration # [{
 
@@ -699,6 +699,7 @@ func.func @illegal_im2col_strides(%arg0: tensor<2x34x34x640xf32>) -> tensor<2x10
   %1 = iree_linalg_ext.im2col strides = [1] dilations = [1, 1] kernel_size = [3, 3]
            m_offset = [0] * [1] k_offset = [0] * [1]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
   return %1 : tensor<2x1024x5760xf32>
@@ -712,6 +713,7 @@ func.func @illegal_im2col_dilations(%arg0: tensor<2x34x34x640xf32>) -> tensor<2x
   %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1, 1] kernel_size = [3, 3]
            m_offset = [0] * [1] k_offset = [0] * [1]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
   return %1 : tensor<2x1024x5760xf32>
@@ -725,6 +727,7 @@ func.func @illegal_im2col_kernel_size(%arg0: tensor<2x34x34x640xf32>) -> tensor<
   %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3]
            m_offset = [0] * [1] k_offset = [0] * [1]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
   return %1 : tensor<2x1024x5760xf32>
@@ -738,6 +741,7 @@ func.func @illegal_im2col_m_offset(%arg0: tensor<2x34x34x640xf32>) -> tensor<2x1
   %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
            m_offset = [0, 0] * [1] k_offset = [0] * [1]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
   return %1 : tensor<2x1024x5760xf32>
@@ -751,6 +755,7 @@ func.func @illegal_im2col_k_offset(%arg0: tensor<2x34x34x640xf32>) -> tensor<2x1
   %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
            m_offset = [0] * [1] k_offset = [0, 0] * [1]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
   return %1 : tensor<2x1024x5760xf32>
@@ -764,6 +769,7 @@ func.func @illegal_im2col_m_strides(%arg0: tensor<2x34x34x640xf32>) -> tensor<2x
   %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
            m_offset = [0] * [0] k_offset = [0] * [1]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
   return %1 : tensor<2x1024x5760xf32>
@@ -777,6 +783,7 @@ func.func @illegal_im2col_k_strides(%arg0: tensor<2x34x34x640xf32>) -> tensor<2x
   %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
            m_offset = [0] * [1] k_offset = [0] * [2]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
   return %1 : tensor<2x1024x5760xf32>
@@ -790,6 +797,7 @@ func.func @illegal_im2col_input_rank(%arg0: tensor<1x2x34x34x640xf32>) -> tensor
   %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
            m_offset = [0] * [1] k_offset = [0] * [1]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<1x2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
   return %1 : tensor<2x1024x5760xf32>
@@ -803,13 +811,42 @@ func.func @illegal_im2col_output_rank(%arg0: tensor<2x34x34x640xf32>) -> tensor<
   %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
            m_offset = [0] * [1] k_offset = [0] * [1]
            batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1, 2]
            ins(%arg0 : tensor<2x34x34x640xf32>)
            outs(%0 : tensor<2x1024x9x640xf32>) -> tensor<2x1024x9x640xf32>
   return %1 : tensor<2x1024x9x640xf32>
 }
 
 // -----
 
+// Negative test: `input_k_perm` has 2 entries, while `m_pos` (2 entries) and
+// `k_pos` (1 entry) together name 3 dimensions. The op is expected to be
+// rejected with the size-mismatch diagnostic checked below.
+func.func @illegal_im2col_perm_num(%arg0: tensor<2x34x34x640xf32>) -> tensor<2x1024x5760xf32> {
+  %0 = tensor.empty() : tensor<2x1024x5760xf32>
+  // expected-error @+1 {{expected input_k_perm size (2) to match the number of shared dimensions (m_Pos + k_pos = 3)}}
+  %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
+           m_offset = [0] * [1] k_offset = [0] * [1]
+           batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [0, 1]
+           ins(%arg0 : tensor<2x34x34x640xf32>)
+           outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
+  return %1 : tensor<2x1024x5760xf32>
+}
+
+// -----
+
+// Negative test: `input_k_perm` has the expected 3 entries, but [1, 2, 3]
+// contains 3 and omits 0, so it is not a permutation of [0, 3). The op is
+// expected to be rejected with the diagnostic checked below.
+func.func @illegal_im2col_perm_value(%arg0: tensor<2x34x34x640xf32>) -> tensor<2x1024x5760xf32> {
+  %0 = tensor.empty() : tensor<2x1024x5760xf32>
+  // expected-error @+1 {{expected input_k_perm to be a permutation of [0, 3)}}
+  %1 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3]
+           m_offset = [0] * [1] k_offset = [0] * [1]
+           batch_pos = [0] m_pos = [1, 2] k_pos = [3]
+           input_k_perm = [1, 2, 3]
+           ins(%arg0 : tensor<2x34x34x640xf32>)
+           outs(%0 : tensor<2x1024x5760xf32>) -> tensor<2x1024x5760xf32>
+  return %1 : tensor<2x1024x5760xf32>
+}
+
+// -----
+
 func.func @illegal_winograd_input_shape(%arg0: tensor<1x10x10x32xf32>) -> tensor<8x8x1x6x6x32xf32> {
   %0 = tensor.empty() : tensor<8x8x1x6x6x32xf32>
   // expected-error @+1 {{incompatible output shape}}