Commit b2441ce

jtuylsweidel-p authored and committed
[Codegen] Tile memref.copy when vectorizing for dynamic dims (iree-org#22168)
In the case of a `memref.copy` with dynamic dimensions, we currently generate scalar code because the operation cannot be vectorized:

```
memref.copy %source, %dest : memref<?x4xf32> to memref<?x4xf32>
```

This PR adds logic to first tile the copy as a `linalg.copy`, which yields static inner copies whenever the dynamic dimension is not the innermost one. The resulting static `memref.copy` is then vectorized where possible.

Signed-off-by: Jorn Tuyls <[email protected]>
Signed-off-by: Philipp <[email protected]>
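For illustration, here is a rough sketch of the IR this produces for the example above, assembled from the new `@memref_copy_dynamic` test added in this PR (the subview types and exact vector ops are illustrative, not verbatim compiler output):

```mlir
func.func @memref_copy_dynamic(%source: memref<?x4xf32>, %dest: memref<?x4xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %pad = arith.constant 0.0 : f32
  // Tile sizes [1, 4]: step the dynamic outer dim by 1 and the static inner
  // dim by 4, so every iteration copies a static 1x4 tile.
  %dim = memref.dim %source, %c0 : memref<?x4xf32>
  scf.for %i = %c0 to %dim step %c1 {
    scf.for %j = %c0 to %c4 step %c4 {
      %src = memref.subview %source[%i, %j] [1, 4] [1, 1]
          : memref<?x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
      %dst = memref.subview %dest[%i, %j] [1, 4] [1, 1]
          : memref<?x4xf32> to memref<1x4xf32, strided<[4, 1], offset: ?>>
      // The static tile vectorizes into one transfer_read/transfer_write pair
      // instead of scalar element-by-element copies.
      %v = vector.transfer_read %src[%c0, %c0], %pad
          : memref<1x4xf32, strided<[4, 1], offset: ?>>, vector<1x4xf32>
      vector.transfer_write %v, %dst[%c0, %c0]
          : vector<1x4xf32>, memref<1x4xf32, strided<[4, 1], offset: ?>>
    }
  }
  return
}
```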
1 parent 0231745 commit b2441ce

File tree: 7 files changed, +284 −75 lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeCopyUsingForall.cpp

Lines changed: 1 addition & 17 deletions
```diff
@@ -7,6 +7,7 @@
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
+#include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -28,9 +29,6 @@ namespace {
 // transfer_read/transfer_write ops.
 //====---------------------------------------------------------------------===//
 
-// For optimal performance we always want to copy 128 bits
-static constexpr int kPreferredCopyNumBits = 128;
-
 // Moves the copy into a single threaded forall.
 static void distributeCopyToSingleThread(RewriterBase &rewriter,
                                          memref::CopyOp copy) {
@@ -113,20 +111,6 @@ static void distributeCopyToThreads(RewriterBase &rewriter, memref::CopyOp copy,
   rewriter.replaceOpWithNewOp<memref::CopyOp>(copy, sourceTile, targetTile);
 }
 
-static SmallVector<OpFoldResult> getCopyTileSizes(Builder &b,
-                                                  memref::CopyOp copy) {
-  int64_t rank = copy.getTarget().getType().getRank();
-  if (rank == 0) {
-    return {};
-  }
-
-  SmallVector<OpFoldResult> tileSizes(rank - 1, b.getIndexAttr(1));
-  int64_t elementBitWidth = llvm::cast<MemRefType>(copy.getTarget().getType())
-                                .getElementTypeBitWidth();
-  tileSizes.push_back(b.getIndexAttr(kPreferredCopyNumBits / elementBitWidth));
-  return tileSizes;
-}
-
 } // namespace
 
 namespace {
```

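`kPreferredCopyNumBits` and `getCopyTileSizes` are removed here; given the new `#include "iree/compiler/Codegen/Utils/Utils.h"` in both this file and `VectorizeMemrefCopy.cpp`, the tile-size helper presumably now lives in the shared codegen utilities. Per the removed code, its policy is a tile size of 1 for every dimension except the innermost, which gets 128 bits' worth of elements (4 for f32). A hedged sketch of the resulting loop nest for a `memref<2x6xf32>` copy, matching the new `@memref_copy_not_multiple_of_preferred` test below (function name hypothetical, subview types illustrative):

```mlir
func.func @copy_2x6_tiled(%source: memref<2x6xf32>, %dest: memref<2x6xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  // Tile sizes [1, 4]: 128 preferred copy bits / 32-bit elements = 4.
  scf.for %i = %c0 to %c2 step %c1 {
    scf.for %j = %c0 to %c6 step %c4 {
      // 6 is not a multiple of 4, so the tail tile is clamped to 2 elements.
      %min = affine.min affine_map<(d0) -> (-d0 + 6, 4)>(%j)
      %src = memref.subview %source[%i, %j] [1, %min] [1, 1]
          : memref<2x6xf32> to memref<1x?xf32, strided<[6, 1], offset: ?>>
      %dst = memref.subview %dest[%i, %j] [1, %min] [1, 1]
          : memref<2x6xf32> to memref<1x?xf32, strided<[6, 1], offset: ?>>
      // The dynamically sized tail tile stays a memref.copy (no masking yet).
      memref.copy %src, %dst
          : memref<1x?xf32, strided<[6, 1], offset: ?>> to memref<1x?xf32, strided<[6, 1], offset: ?>>
    }
  }
  return
}
```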
compiler/src/iree/compiler/Codegen/Common/VectorizeMemrefCopy.cpp

Lines changed: 71 additions & 4 deletions
```diff
@@ -5,46 +5,113 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
+#define DEBUG_TYPE "iree-codegen-vectorize-memref-copy"
+
+constexpr char kIsTiled[] = "_is_tiled";
+
 namespace mlir::iree_compiler {
 
 #define GEN_PASS_DEF_VECTORIZEMEMREFCOPYPASS
 #include "iree/compiler/Codegen/Common/Passes.h.inc"
 
 namespace {
 
+struct TileLinalgCopy final : OpRewritePattern<memref::CopyOp> {
+  using Base::Base;
+  LogicalResult matchAndRewrite(memref::CopyOp copyOp,
+                                PatternRewriter &rewriter) const override {
+    if (copyOp->hasAttr(kIsTiled)) {
+      return rewriter.notifyMatchFailure(copyOp, "already tiled");
+    }
+    auto linalgCopy = linalg::CopyOp::create(
+        rewriter, copyOp.getLoc(), copyOp.getSource(), copyOp.getTarget());
+    std::optional<SmallVector<int64_t>> maybeStaticTileSizes =
+        getCopyTileSizes(linalgCopy);
+    if (!maybeStaticTileSizes.has_value()) {
+      rewriter.eraseOp(linalgCopy);
+      return rewriter.notifyMatchFailure(copyOp,
+                                         "could not retrieve tile sizes");
+    }
+    SmallVector<int64_t> staticBounds = linalgCopy.getStaticLoopRanges();
+
+    auto tilingInterfaceOp = cast<TilingInterface>(linalgCopy.getOperation());
+    rewriter.setInsertionPoint(tilingInterfaceOp);
+    SmallVector<OpFoldResult> tileSizes = getAsIndexOpFoldResult(
+        rewriter.getContext(), maybeStaticTileSizes.value());
+
+    scf::SCFTilingOptions tilingOptions;
+    tilingOptions.setTileSizes(tileSizes);
+    tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForOp);
+
+    FailureOr<scf::SCFTilingResult> tilingResult =
+        scf::tileUsingSCF(rewriter, tilingInterfaceOp, tilingOptions);
+    if (failed(tilingResult)) {
+      return rewriter.notifyMatchFailure(copyOp, "tiling failed");
+    }
+    // Put a marker on the tiled ops so it's easy to recognize that they
+    // shouldn't be tiled again.
+    for (Operation *tiledOp : tilingResult->tiledOps) {
+      tiledOp->setAttr(kIsTiled, mlir::UnitAttr::get(copyOp.getContext()));
+    }
+    if (tilingInterfaceOp->use_empty()) {
+      rewriter.eraseOp(tilingInterfaceOp);
+    }
+    rewriter.eraseOp(copyOp);
+    return success();
+  }
+};
+
 struct ConvertLinalgCopyToMemrefCopy final : OpRewritePattern<linalg::CopyOp> {
   using Base::Base;
   LogicalResult matchAndRewrite(linalg::CopyOp copyOp,
                                 PatternRewriter &rewriter) const override {
     if (copyOp.hasPureTensorSemantics()) {
       return failure();
     }
-    memref::CopyOp::create(rewriter, copyOp.getLoc(),
-                           copyOp.getDpsInputOperand(0)->get(),
-                           copyOp.getDpsInitOperand(0)->get());
+    auto newCopy = memref::CopyOp::create(rewriter, copyOp.getLoc(),
+                                          copyOp.getDpsInputOperand(0)->get(),
+                                          copyOp.getDpsInitOperand(0)->get());
+    newCopy->setAttrs(copyOp->getAttrs());
     rewriter.eraseOp(copyOp);
     return success();
   }
 };
 
+/// TODO(#22245): Enable vector masking for unaligned/dynamic copies to improve
+/// copy performance further.
 struct VectorizeMemrefCopyPass final
     : impl::VectorizeMemrefCopyPassBase<VectorizeMemrefCopyPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<arith::ArithDialect, vector::VectorDialect>();
+    registry.insert<arith::ArithDialect, linalg::LinalgDialect, scf::SCFDialect,
+                    vector::VectorDialect>();
   }
   void runOnOperation() override {
     MLIRContext *ctx = &getContext();
     auto funcOp = getOperation();
 
+    // First convert all `memref.copy` operations to `linalg.copy` so that they
+    // can be tiled. Tiling them avoids copies with dynamic dimensions if the
+    // dynamic dimension is not the innermost. Afterwards, tiled `linalg.copy`
+    // operations are converted back to `memref.copy` operations and vectorized.
     RewritePatternSet patterns(ctx);
+    patterns.add<TileLinalgCopy>(&getContext());
     patterns.add<linalg::CopyVectorizationPattern>(&getContext());
     patterns.add<ConvertLinalgCopyToMemrefCopy>(&getContext());
     (void)applyPatternsGreedily(funcOp, std::move(patterns));
+
+    // Clean up the temporary isTiled markers.
+    funcOp->walk([](memref::CopyOp copyOp) {
+      if (copyOp->hasAttr(kIsTiled)) {
+        copyOp->removeAttr(kIsTiled);
+      }
+    });
   }
 };
```

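The masking TODO above concerns copies whose innermost dimension is dynamic: tiling still leaves a dynamically shaped `memref.copy` inside the loop, which cannot be vectorized without masking. A rough sketch of that residual IR, based on the new `@memref_copy_dynamic_inner_dim` test below (subview types are illustrative):

```mlir
func.func @memref_copy_dynamic_inner_dim(%source: memref<4x?xf32>, %dest: memref<4x?xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %dim = memref.dim %source, %c1 : memref<4x?xf32>
  scf.for %i = %c0 to %c4 step %c1 {
    scf.for %j = %c0 to %dim step %c4 {
      // The inner tile is clamped at the dynamic boundary, so its shape stays
      // dynamic and the copy below is left unvectorized.
      %min = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%j)[%dim]
      %src = memref.subview %source[%i, %j] [1, %min] [1, 1]
          : memref<4x?xf32> to memref<1x?xf32, strided<[?, 1], offset: ?>>
      %dst = memref.subview %dest[%i, %j] [1, %min] [1, 1]
          : memref<4x?xf32> to memref<1x?xf32, strided<[?, 1], offset: ?>>
      memref.copy %src, %dst
          : memref<1x?xf32, strided<[?, 1], offset: ?>> to memref<1x?xf32, strided<[?, 1], offset: ?>>
    }
  }
  return
}
```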
compiler/src/iree/compiler/Codegen/Common/test/vectorize_memref_copy.mlir

Lines changed: 100 additions & 4 deletions
```diff
@@ -8,8 +8,14 @@ func.func @memref_copy(%source: memref<2x2xf32>, %dest: memref<2x2xf32>) {
 // CHECK-LABEL: func.func @memref_copy
 // CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x2xf32>
 // CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x2xf32>
-// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE]]
-// CHECK: vector.transfer_write %[[RD]], %[[DEST]]
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
+// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
+// CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
 
 // -----
 
@@ -21,5 +27,95 @@ func.func @linalg_copy(%source: memref<2x2xf32>, %dest: memref<2x2xf32>) {
 // CHECK-LABEL: func.func @linalg_copy
 // CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x2xf32>
 // CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x2xf32>
-// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE]]
-// CHECK: vector.transfer_write %[[RD]], %[[DEST]]
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [2, 2] [1, 1]
+// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
+// CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
+
+// -----
+
+// Test with the last dimension larger than and not a multiple of the preferred number of copy elements.
+
+func.func @memref_copy_not_multiple_of_preferred(%source: memref<2x6xf32>, %dest: memref<2x6xf32>) {
+  memref.copy %source, %dest : memref<2x6xf32> to memref<2x6xf32>
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_not_multiple_of_preferred
+// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<2x6xf32>
+// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<2x6xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C2]] step %[[C1]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C6]] step %[[C4]]
+// CHECK: %[[MIN:.+]] = affine.min affine_map<(d0) -> (-d0 + 6, 4)>(%[[ARG3]])
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
+// CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
+
+// -----
+
+// Test with the penultimate dimension larger than and not a multiple of the preferred number of copy elements on that dimension.
+
+func.func @memref_copy_not_multiple_on_penultimate_dim(%source: memref<3x2xf32>, %dest: memref<3x2xf32>) {
+  memref.copy %source, %dest : memref<3x2xf32> to memref<3x2xf32>
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_not_multiple_on_penultimate_dim
+// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<3x2xf32>
+// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<3x2xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C3]] step %[[C2]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C2]] step %[[C2]]
+// CHECK: %[[MIN:.+]] = affine.min affine_map<(d0) -> (-d0 + 3, 2)>(%[[ARG2]])
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [%[[MIN]], 2] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [%[[MIN]], 2] [1, 1]
+// CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
+
+// -----
+
+func.func @memref_copy_dynamic(%source: memref<?x4xf32>, %dest: memref<?x4xf32>) {
+  memref.copy %source, %dest : memref<?x4xf32> to memref<?x4xf32>
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_dynamic
+// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<?x4xf32>
+// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<?x4xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[DIM:.+]] = memref.dim %[[SOURCE]], %[[C0]] : memref<?x4xf32>
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[DIM]] step %[[C1]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C4]] step %[[C4]]
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, 4] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, 4] [1, 1]
+// CHECK: %[[RD:.+]] = vector.transfer_read %[[SOURCE_SUBVIEW]]
+// CHECK: vector.transfer_write %[[RD]], %[[DEST_SUBVIEW]]
+
+// -----
+
+func.func @memref_copy_dynamic_inner_dim(%source: memref<4x?xf32>, %dest: memref<4x?xf32>) {
+  memref.copy %source, %dest : memref<4x?xf32> to memref<4x?xf32>
+  return
+}
+// CHECK-LABEL: func.func @memref_copy_dynamic_inner_dim
+// CHECK-SAME: %[[SOURCE:[A-Za-z0-9]+]]: memref<4x?xf32>
+// CHECK-SAME: %[[DEST:[A-Za-z0-9]+]]: memref<4x?xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+// CHECK-DAG: %[[DIM:.+]] = memref.dim %[[SOURCE]], %[[C1]] : memref<4x?xf32>
+// CHECK: scf.for %[[ARG2:.+]] = %[[C0]] to %[[C4]] step %[[C1]]
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[DIM]] step %[[C4]]
+// CHECK: %[[MIN:.+]] = affine.min affine_map<(d0)[s0] -> (-d0 + s0, 4)>(%[[ARG3]])[%[[DIM]]]
+// CHECK: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
+// CHECK: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[ARG2]], %[[ARG3]]] [1, %[[MIN]]] [1, 1]
+// CHECK: memref.copy %[[SOURCE_SUBVIEW]], %[[DEST_SUBVIEW]]
```

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir

Lines changed: 1 addition & 2 deletions
```diff
@@ -231,8 +231,7 @@ hal.executable private @main {
 // CHECK: gpu.barrier
 // CHECK-DAG: %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<4xf16>
 // CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4xf16>
-// CHECK-COUNT-1: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
-// CHECK-NOT: scf.for
+// CHECK-COUNT-1: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32
 // CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
 // -----
```

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 14 additions & 15 deletions
```diff
@@ -1281,22 +1281,21 @@ hal.executable public @main {
 }
 }
 
-// CHECK-LABEL: func @unaligned_to_intrinsic_batched_matmul_nocpromo
-// CHECK-NOT: memref.alloc() {{.*}}xf32
-// CHECK-DAG: memref.alloc() : memref<1x4x66xf32, #gpu.address_space<workgroup>>
-// CHECK-DAG: memref.alloc() : memref<1x16x6xf32, #gpu.address_space<workgroup>>
-// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
-// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
-// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
-// CHECK: scf.forall ({{.*}}) in (12, 37, 10) {
-// CHECK: scf.for %[[IV:.+]] = %c0 to %c144 step %c1 {{.*}} -> (vector<1x1x1x4x1xf32>)
-// CHECK: gpu.barrier
-// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
-// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
+// CHECK-LABEL: func @unaligned_to_intrinsic_batched_matmul_nocpromo
+// CHECK-NOT: memref.alloc() {{.*}}xf32
+// CHECK-DAG: memref.alloc() : memref<1x4x66xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: memref.alloc() : memref<1x16x6xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(0)
+// CHECK-DAG: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(1)
+// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK: scf.forall ({{.*}}) in (12, 37, 10) {
+// CHECK: scf.for %[[IV:.+]] = %c0 to %c144 step %c1 {{.*}} -> (vector<1x1x1x4x1xf32>)
+// CHECK: gpu.barrier
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
+// CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
 // CHECK-COUNT-1: amdgpu.mfma {{.*}}blocks = 1 : i32, k = 4 : i32, m = 16 : i32, n = 16 : i32
-// CHECK: scf.yield
-// CHECK-NOT: scf.for
-// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
+// CHECK: scf.yield
+// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
 // -----
```