From bb1d794c8a42e6d078b31ca7570845d82ce9905d Mon Sep 17 00:00:00 2001
From: Shashi Shankar
Date: Thu, 13 Nov 2025 22:11:54 +0100
Subject: [PATCH] [MLIR][SCFToGPU] Guard operands before AffineApplyOp::create to avoid crash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When lowering a mapped scf.parallel to gpu.launch, the per-dimension index
was built with AffineApplyOp::create using ensureLaunchIndependent(lb/step)
directly. If lb/step were not available at the launch scope, that helper
returned an empty value and the builder crashed while creating the op.

Mirror the bound-handling path: first map lb/step through cloningMap, then
call ensureLaunchIndependent. If either operand still isn't available above
the launch, report a precise match failure instead of crashing. This makes
the conversion fail cleanly on invalid cases and succeed on valid ones.

Add two regression tests that previously crashed and now lower to
gpu.launch (and affine.apply).

Fixes: #167654

Signed-off-by: Shashi Shankar
---
 mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp  | 18 +++++++++--
 .../parallel-to-gpu-crash-regression.mlir  | 31 +++++++++++++++++++
 .../parallel-to-gpu-index-creation.mlir    | 24 ++++++++++++++
 3 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Conversion/SCFToGPU/parallel-to-gpu-crash-regression.mlir
 create mode 100644 mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir

diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index 76a822b05a652..309121f520811 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -453,10 +453,24 @@ static LogicalResult processParallelLoop(
           1, 2,
           rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
               rewriter.getAffineSymbolExpr(1));
+      // Map through cloningMap first so we use values valid at the launch
+      // scope, then ensure they are launch-independent (or cloned constants).
+      Value mappedStep = cloningMap.lookupOrDefault(step);
+      Value mappedLowerBound = cloningMap.lookupOrDefault(lowerBound);
+
+      mappedStep = ensureLaunchIndependent(mappedStep);
+      mappedLowerBound = ensureLaunchIndependent(mappedLowerBound);
+
+      // If either cannot be made available above the launch, fail gracefully.
+      if (!mappedStep || !mappedLowerBound) {
+        return rewriter.notifyMatchFailure(
+            parallelOp, "lower bound / step must be constant or defined above "
+                        "the gpu.launch");
+      }
+
       newIndex = AffineApplyOp::create(
           rewriter, loc, annotation.getMap().compose(lowerAndStep),
-          ValueRange{operand, ensureLaunchIndependent(step),
-                     ensureLaunchIndependent(lowerBound)});
+          ValueRange{operand, mappedStep, mappedLowerBound});
       // If there was also a bound, insert that, too.
       // TODO: Check that we do not assign bounds twice.
       if (annotation.getBound()) {
diff --git a/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-crash-regression.mlir b/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-crash-regression.mlir
new file mode 100644
index 0000000000000..01daed159b6fd
--- /dev/null
+++ b/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-crash-regression.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s --convert-parallel-loops-to-gpu | FileCheck %s
+
+// Goal: exercise the per-dim index computation
+//   newIndex = hardware_id * step + lowerBound
+// and ensure we see a gpu.launch and an affine.apply (no crash).
+
+module {
+  func.func @two_dim_parallel_mapped() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+
+    // Single 2-D scf.parallel. Each dimension is mapped to a GPU dim.
+    // We *use* both IVs so the conversion must build indices.
+    scf.parallel (%bx, %tx) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) {
+      %u = arith.addi %bx, %c0 : index
+      %v = arith.addi %tx, %c0 : index
+      // No explicit terminator: the parser inserts an empty scf.reduce.
+    } {
+      mapping = [
+        #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+        #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
+      ]
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @two_dim_parallel_mapped
+// CHECK: gpu.launch
+// CHECK: affine.apply
diff --git a/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir b/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir
new file mode 100644
index 0000000000000..55e425a77c18f
--- /dev/null
+++ b/mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s --convert-parallel-loops-to-gpu | FileCheck %s
+
+module {
+  func.func @one_dim_parallel_mapped() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+
+    // 1-D loop mapped to thread_x; use the IV to force index computation.
+    scf.parallel (%t) = (%c0) to (%c64) step (%c1) {
+      %w = arith.addi %t, %c0 : index
+      // Implicit empty scf.reduce terminator.
+    } {
+      mapping = [
+        #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
+      ]
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @one_dim_parallel_mapped
+// CHECK: gpu.launch
+// CHECK: affine.apply
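
Reviewer note (not part of the applied patch): with the guard in place, the
conversion still composes the annotation map with (d0)[s0, s1] -> (d0 * s0 + s1),
so the per-dimension index it materializes is essentially the affine.apply
sketched below. This is a hand-written, standalone sketch, not the pass output;
the function name and the %tid/%step/%lb values are placeholders, and in real
output the apply sits inside the generated gpu.launch region.

// Sketch only: hardware id * step + lower bound, the computation whose
// operands this patch guards. Parses with mlir-opt; names are illustrative.
func.func @index_computation_sketch(%tid: index, %step: index, %lb: index) -> index {
  %idx = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%tid)[%step, %lb]
  return %idx : index
}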