18 changes: 16 additions & 2 deletions mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -453,10 +453,24 @@ static LogicalResult processParallelLoop(
         1, 2,
         rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
             rewriter.getAffineSymbolExpr(1));
+    // Map through cloningMap first so we use values valid at the launch
+    // scope, then ensure they are launch-independent (or cloned constants).
+    Value mappedStep = cloningMap.lookupOrDefault(step);
+    Value mappedLowerBound = cloningMap.lookupOrDefault(lowerBound);
+
+    mappedStep = ensureLaunchIndependent(mappedStep);
+    mappedLowerBound = ensureLaunchIndependent(mappedLowerBound);
+
+    // If either cannot be made available above the launch, fail gracefully.
+    if (!mappedStep || !mappedLowerBound) {
+      return rewriter.notifyMatchFailure(
+          parallelOp, "lower bound / step must be constant or defined above "
+                      "the gpu.launch");
+    }
+
     newIndex = AffineApplyOp::create(
         rewriter, loc, annotation.getMap().compose(lowerAndStep),
-        ValueRange{operand, ensureLaunchIndependent(step),
-                   ensureLaunchIndependent(lowerBound)});
+        ValueRange{operand, mappedStep, mappedLowerBound});
     // If there was also a bound, insert that, too.
     // TODO: Check that we do not assign bounds twice.
     if (annotation.getBound()) {
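For reference, the index built above is the annotation's map composed with `lowerAndStep`, i.e. `hardware_id * step + lowerBound` per mapped dimension. A minimal sketch of the resulting IR, assuming the identity mapping `(d0) -> (d0)` and purely illustrative SSA names (`%tid`, `%step`, `%lb` are placeholders, not the pass's verbatim output):

// Illustrative only: with map = (d0) -> (d0), the composed map stays
// (d0)[s0, s1] -> (d0 * s0 + s1); it is applied to the hardware id and the
// launch-independent step and lower bound.
%new_index = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%tid)[%step, %lb]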
@@ -0,0 +1,31 @@
// RUN: mlir-opt %s --convert-parallel-loops-to-gpu | FileCheck %s

// Goal: exercise the per-dim index computation
// newIndex = hardware_id * step + lowerBound
// and ensure we see a gpu.launch and an affine.apply (no crash).

module {
func.func @two_dim_parallel_mapped() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index

    // Single 2-D scf.parallel. Each dimension is mapped to a GPU dim.
// We *use* both IVs so the conversion must build indices.
scf.parallel (%bx, %tx) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) {
%u = arith.addi %bx, %c0 : index
%v = arith.addi %tx, %c0 : index
// No explicit terminator: the parser inserts an empty scf.reduce.
} {
mapping = [
#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
]
}
return
}
}

// CHECK-LABEL: func.func @two_dim_parallel_mapped
// CHECK: gpu.launch
// CHECK: affine.apply
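
To make the goal comment above concrete, the launch body this test expects contains one such index computation per mapped dimension: one fed by the block id and one by the thread id. A rough sketch of that shape, where the SSA names and the way the grid/block sizes are materialized are assumptions rather than the verbatim output of the conversion:

func.func @sketch_of_expected_body() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %c32, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %c32, %sy = %c1, %sz = %c1) {
    // newIndex = hardware_id * step + lowerBound for each mapped dimension.
    %i = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%bx)[%c1, %c0]
    %j = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%tx)[%c1, %c0]
    gpu.terminator
  }
  return
}

Checking only for gpu.launch and affine.apply keeps the test focused on the "no crash" goal stated above.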
24 changes: 24 additions & 0 deletions mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir
@@ -0,0 +1,24 @@
// RUN: mlir-opt %s --convert-parallel-loops-to-gpu | FileCheck %s

module {
func.func @one_dim_parallel_mapped() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

    // 1-D loop mapped to thread_x; use the IV to force index computation.
scf.parallel (%t) = (%c0) to (%c64) step (%c1) {
%w = arith.addi %t, %c0 : index
// Implicit empty scf.reduce terminator.
} {
mapping = [
#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
]
}
return
}
}

// CHECK-LABEL: func.func @one_dim_parallel_mapped
// CHECK: gpu.launch
// CHECK: affine.apply