Skip to content

Commit c1b2110

Browse files
authored
[mlir][gpu] Add innermost-first policy when mapping loops to GPU IDs (#160634)
1 parent 538325f commit c1b2110

File tree

3 files changed

+120
-36
lines changed

3 files changed

+120
-36
lines changed

mlir/include/mlir/Dialect/GPU/Transforms/Passes.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,19 @@ def GpuMapParallelLoopsPass
3939
encountered to the local workgroup. Within each mapping, the first three
4040
dimensions are mapped to x/y/z hardware ids and all following dimensions are
4141
mapped to sequential loops.
42+
43+
Ordering of the loop mapping against the different dimensions is controlled
44+
by the `mapping-policy` option.
45+
Two policies are supported:
46+
1. `outermost-first` (default): the outermost loop maps to X, then Y
47+
and finally Z.
48+
2. `innermost-first`: the innermost loop maps to X, then Y and finally Z.
4249
}];
4350
let dependentDialects = ["mlir::gpu::GPUDialect"];
51+
let options = [Option<"mappingPolicyStr", "mapping-policy", "std::string",
52+
/*default=*/"\"outermost-first\"",
53+
"Policy outlining how to assign loops to GPU dimensions. "
54+
"Supported values are `outermost-first` and `innermost-first`.">];
4455
}
4556

4657
def GpuEliminateBarriers

mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ gpu::setMappingAttr(ParallelOp ploopOp,
5252
namespace gpu {
5353
namespace {
5454
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
55+
enum class MappingPolicy { OutermostFirst, InnermostFirst };
5556
} // namespace
5657

5758
static constexpr int kNumHardwareIds = 3;
@@ -65,16 +66,30 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
6566
return mappingLevel;
6667
}
6768

69+
// Map the policy string to a typed mapping policy.
70+
// TODO: Revisit this and possibly use a loop interchange pass instead.
71+
static FailureOr<MappingPolicy> getMappingPolicyFromStr(StringRef policy) {
72+
std::string policyCanonical = policy.trim().lower();
73+
74+
std::optional<MappingPolicy> option =
75+
llvm::StringSwitch<std::optional<MappingPolicy>>(policyCanonical)
76+
.Case("innermost-first", MappingPolicy::InnermostFirst)
77+
.Case("outermost-first", MappingPolicy::OutermostFirst)
78+
.Default(std::nullopt);
79+
80+
if (!option)
81+
return failure();
82+
return *option;
83+
}
84+
6885
/// Computes the hardware id to use for a given mapping level. Will
6986
/// assign x,y and z hardware ids for the first 3 dimensions and use
7087
/// sequential after.
71-
/// TODO: Make this use x for the inner-most loop that is
72-
/// distributed to map to x, the next innermost to y and the next innermost to
73-
/// z.
7488
static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
7589

7690
if (dimension >= kNumHardwareIds || level == Sequential)
7791
return Processor::Sequential;
92+
7893
switch (level) {
7994
case MapGrid:
8095
switch (dimension) {
@@ -107,20 +122,35 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
107122
/// Add mapping information to the given parallel loop. Do not add
108123
/// mapping information if the loop already has it. Also, don't
109124
/// start a mapping at a nested loop.
110-
static void mapParallelOp(ParallelOp parallelOp,
111-
MappingLevel mappingLevel = MapGrid) {
125+
static void
126+
mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid,
127+
MappingPolicy mappingPolicy = MappingPolicy::OutermostFirst) {
112128
// Do not try to add a mapping to already mapped loops or nested loops.
113129
if (parallelOp->getAttr(getMappingAttrName()) ||
114130
((mappingLevel == MapGrid) && parallelOp->getParentOfType<ParallelOp>()))
115131
return;
116132

133+
const int numLoops = static_cast<int>(parallelOp.getNumLoops());
134+
const int loopsToMap = std::min(numLoops, kNumHardwareIds);
135+
117136
MLIRContext *ctx = parallelOp.getContext();
118137
Builder b(ctx);
119138
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
120-
attrs.reserve(parallelOp.getNumLoops());
121-
for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
139+
attrs.reserve(numLoops);
140+
141+
for (int i = 0; i < numLoops; ++i) {
142+
143+
// Determine the mapping to use for this loop.
144+
// If there are more loops to map than HW IDs, map to sequential.
145+
int hwMapping = kNumHardwareIds;
146+
if (i < loopsToMap) {
147+
hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst)
148+
? i
149+
: (loopsToMap - 1 - i);
150+
}
151+
122152
attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
123-
getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
153+
getHardwareIdForMapping(mappingLevel, hwMapping), b.getDimIdentityMap(),
124154
b.getDimIdentityMap()));
125155
}
126156
(void)setMappingAttr(parallelOp, attrs);
@@ -129,16 +159,31 @@ static void mapParallelOp(ParallelOp parallelOp,
129159
// walk but just iterate over the operations.
130160
for (Operation &op : *parallelOp.getBody()) {
131161
if (ParallelOp nested = dyn_cast<ParallelOp>(op))
132-
mapParallelOp(nested, mappingLevel);
162+
mapParallelOp(nested, mappingLevel, mappingPolicy);
133163
}
134164
}
135165

136166
namespace {
137167
struct GpuMapParallelLoopsPass
138168
: public impl::GpuMapParallelLoopsPassBase<GpuMapParallelLoopsPass> {
169+
using Base::Base;
170+
139171
void runOnOperation() override {
172+
// Parse the mapping policy.
173+
FailureOr<MappingPolicy> policyOrFailure =
174+
getMappingPolicyFromStr(mappingPolicyStr);
175+
if (failed(policyOrFailure)) {
176+
getOperation()->emitError() << "Invalid mapping policy specified.";
177+
return signalPassFailure();
178+
}
179+
180+
MappingPolicy policy = *policyOrFailure;
181+
MappingLevel topLevel = MappingLevel::MapGrid;
182+
140183
for (Region &region : getOperation()->getRegions()) {
141-
region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
184+
region.walk([&](ParallelOp parallelOp) {
185+
mapParallelOp(parallelOp, topLevel, policy);
186+
});
142187
}
143188
}
144189
};

mlir/test/Dialect/GPU/mapping.mlir

Lines changed: 54 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s
1+
// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s --check-prefix=OUTER
2+
// RUN: mlir-opt -gpu-map-parallel-loops="mapping-policy=innermost-first" -split-input-file %s | FileCheck %s --check-prefix=INNER
23

34
func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
45
%arg3 : index) {
@@ -14,14 +15,23 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
1415
return
1516
}
1617

17-
// CHECK-LABEL: func @parallel_loop(
18-
// CHECK: scf.parallel
19-
// CHECK: scf.parallel
20-
// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
21-
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
22-
// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
23-
// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
24-
// CHECK-NOT: mapping
18+
// OUTER-LABEL: func @parallel_loop(
19+
// OUTER: scf.parallel
20+
// OUTER: scf.parallel
21+
// OUTER: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
22+
// OUTER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
23+
// OUTER: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
24+
// OUTER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
25+
// OUTER-NOT: mapping
26+
27+
// INNER-LABEL: func @parallel_loop(
28+
// INNER: scf.parallel
29+
// INNER: scf.parallel
30+
// INNER: {mapping = [#gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
31+
// INNER-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
32+
// INNER: {mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
33+
// INNER-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
34+
// INNER-NOT: mapping
2535

2636
// -----
2737

@@ -42,20 +52,38 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
4252
return
4353
}
4454

45-
// CHECK-LABEL: func @parallel_loop_4d(
46-
// CHECK: scf.parallel
47-
// CHECK: scf.parallel
48-
// CHECK: scf.parallel
49-
// CHECK: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
50-
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
51-
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
52-
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
53-
// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
54-
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
55-
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
56-
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
57-
// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
58-
// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
59-
// CHECK-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
60-
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
61-
// CHECK-NOT: mapping
55+
// OUTER-LABEL: func @parallel_loop_4d(
56+
// OUTER: scf.parallel
57+
// OUTER: scf.parallel
58+
// OUTER: scf.parallel
59+
// OUTER: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
60+
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
61+
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
62+
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
63+
// OUTER: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
64+
// OUTER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
65+
// OUTER-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
66+
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
67+
// OUTER: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
68+
// OUTER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
69+
// OUTER-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
70+
// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
71+
// OUTER-NOT: mapping
72+
73+
// INNER-LABEL: func @parallel_loop_4d(
74+
// INNER: scf.parallel
75+
// INNER: scf.parallel
76+
// INNER: scf.parallel
77+
// INNER: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
78+
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
79+
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
80+
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
81+
// INNER: {mapping = [#gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
82+
// INNER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
83+
// INNER-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
84+
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
85+
// INNER: {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
86+
// INNER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
87+
// INNER-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
88+
// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
89+
// INNER-NOT: mapping

0 commit comments

Comments
 (0)