From bb1d794c8a42e6d078b31ca7570845d82ce9905d Mon Sep 17 00:00:00 2001
From: Shashi Shankar
Date: Thu, 13 Nov 2025 22:11:54 +0100
Subject: [PATCH] [MLIR][SCFToGPU] Guard operands before AffineApplyOp::create to avoid crash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When lowering a mapped scf.parallel to gpu.launch, the per-dimension index
was built with AffineApplyOp::create using ensureLaunchIndependent(lb/step)
directly. If lb/step were not available at the launch scope, that helper
returned an empty value and the builder crashed while creating the op.

Mirror the bound-handling path: first map lb/step through cloningMap, then
call ensureLaunchIndependent. If either operand still isn't available above
the launch, report a precise match failure instead of crashing. This makes
the conversion fail cleanly on invalid cases and succeed on valid ones.

Add two regression tests that previously crashed and now lower to
gpu.launch (and affine.apply).

Fixes: #167654

Signed-off-by: Shashi Shankar
---
 mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp  | 18 +++++++++--
 .../parallel-to-gpu-crash-regression.mlir  | 31 +++++++++++++++++++
 .../parallel-to-gpu-index-creation.mlir    | 24 ++++++++++++++
 3 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Conversion/SCFToGPU/parallel-to-gpu-crash-regression.mlir
 create mode 100644 mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir

diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index 76a822b05a652..309121f520811 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -453,10 +453,24 @@ static LogicalResult processParallelLoop(
           1, 2,
           rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
               rewriter.getAffineSymbolExpr(1));
+      // Map through cloningMap first so we use values valid at the launch
+      // scope, then ensure they are launch-independent (or cloned constants).
+      Value mappedStep = cloningMap.lookupOrDefault(step);
+      Value mappedLowerBound = cloningMap.lookupOrDefault(lowerBound);
+
+      mappedStep = ensureLaunchIndependent(mappedStep);
+      mappedLowerBound = ensureLaunchIndependent(mappedLowerBound);
+
+      // If either cannot be made available above the launch, fail gracefully.
+      if (!mappedStep || !mappedLowerBound) {
+        return rewriter.notifyMatchFailure(
+            parallelOp, "lower bound / step must be constant or defined above "
+                        "the gpu.launch");
+      }
+
       newIndex = AffineApplyOp::create(
           rewriter, loc, annotation.getMap().compose(lowerAndStep),
-          ValueRange{operand, ensureLaunchIndependent(step),
-                     ensureLaunchIndependent(lowerBound)});
+          ValueRange{operand, mappedStep, mappedLowerBound});
       // If there was also a bound, insert that, too.
       // TODO: Check that we do not assign bounds twice.
       if (annotation.getBound()) {
diff --git a/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-crash-regression.mlir b/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-crash-regression.mlir
new file mode 100644
index 0000000000000..01daed159b6fd
--- /dev/null
+++ b/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-crash-regression.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s --convert-parallel-loops-to-gpu | FileCheck %s
+
+// Goal: exercise the per-dim index computation
+//   newIndex = hardware_id * step + lowerBound
+// and ensure we see a gpu.launch and an affine.apply (no crash).
+
+module {
+  func.func @two_dim_parallel_mapped() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+
+    // Single 2-D scf.parallel. Each dimension is mapped to a GPU dim.
+    // We *use* both IVs so the conversion must build indices.
+    scf.parallel (%bx, %tx) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) {
+      %u = arith.addi %bx, %c0 : index
+      %v = arith.addi %tx, %c0 : index
+      // No explicit terminator: the parser inserts an empty scf.reduce.
+    } {
+      mapping = [
+        #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+        #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
+      ]
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @two_dim_parallel_mapped
+// CHECK: gpu.launch
+// CHECK: affine.apply
diff --git a/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir b/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir
new file mode 100644
index 0000000000000..55e425a77c18f
--- /dev/null
+++ b/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s --convert-parallel-loops-to-gpu | FileCheck %s
+
+module {
+  func.func @one_dim_parallel_mapped() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+
+    // 1-D loop mapped to thread_x; use the IV to force index computation.
+    scf.parallel (%t) = (%c0) to (%c64) step (%c1) {
+      %w = arith.addi %t, %c0 : index
+      // Implicit empty scf.reduce terminator.
+    } {
+      mapping = [
+        #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
+      ]
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @one_dim_parallel_mapped
+// CHECK: gpu.launch
+// CHECK: affine.apply
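
Reviewer note (not part of the applied patch): with the guard in place, the
conversion still composes the annotation map with (d0)[s0, s1] -> (d0 * s0 + s1),
so the per-dimension index it materializes is essentially the affine.apply
sketched below. This is a hand-written, standalone sketch, not the pass output;
the function name and the %tid/%step/%lb values are placeholders, and in real
output the apply sits inside the generated gpu.launch region.

// Sketch only: hardware id * step + lower bound, the computation whose
// operands this patch guards. Parses with mlir-opt; names are illustrative.
func.func @index_computation_sketch(%tid: index, %step: index, %lb: index) -> index {
  %idx = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%tid)[%step, %lb]
  return %idx : index
}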