18 changes: 16 additions & 2 deletions mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -453,10 +453,24 @@ static LogicalResult processParallelLoop(
         1, 2,
         rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
             rewriter.getAffineSymbolExpr(1));
+    // Map through cloningMap first so we use values valid at the launch
+    // scope, then ensure they are launch-independent (or cloned constants).
+    Value mappedStep = cloningMap.lookupOrDefault(step);
+    Value mappedLowerBound = cloningMap.lookupOrDefault(lowerBound);
+
+    mappedStep = ensureLaunchIndependent(mappedStep);
+    mappedLowerBound = ensureLaunchIndependent(mappedLowerBound);
+
+    // If either cannot be made available above the launch, fail gracefully.
+    if (!mappedStep || !mappedLowerBound) {
+      return rewriter.notifyMatchFailure(
+          parallelOp, "lower bound / step must be constant or defined above "
+                      "the gpu.launch");
+    }
+
     newIndex = AffineApplyOp::create(
         rewriter, loc, annotation.getMap().compose(lowerAndStep),
-        ValueRange{operand, ensureLaunchIndependent(step),
-                   ensureLaunchIndependent(lowerBound)});
+        ValueRange{operand, mappedStep, mappedLowerBound});
     // If there was also a bound, insert that, too.
     // TODO: Check that we do not assign bounds twice.
     if (annotation.getBound()) {
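For reference, the index built above is the annotation's map composed with `lowerAndStep`, i.e. `hardware_id * step + lowerBound` per mapped dimension. A minimal sketch of the resulting IR, assuming the identity mapping `(d0) -> (d0)` and purely illustrative SSA names (`%tid`, `%step`, `%lb` are placeholders, not the pass's verbatim output):

// Illustrative only: with map = (d0) -> (d0), the composed map stays
// (d0)[s0, s1] -> (d0 * s0 + s1); it is applied to the hardware id and the
// launch-independent step and lower bound.
%new_index = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%tid)[%step, %lb]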
@@ -0,0 +1,31 @@
// RUN: mlir-opt %s --convert-parallel-loops-to-gpu | FileCheck %s

// Goal: exercise the per-dim index computation
// newIndex = hardware_id * step + lowerBound
// and ensure we see a gpu.launch and an affine.apply (no crash).

module {
func.func @two_dim_parallel_mapped() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index

    // Single 2-D scf.parallel. Each dimension is mapped to a GPU dim.
// We *use* both IVs so the conversion must build indices.
scf.parallel (%bx, %tx) = (%c0, %c0) to (%c32, %c32) step (%c1, %c1) {
%u = arith.addi %bx, %c0 : index
%v = arith.addi %tx, %c0 : index
// No explicit terminator: the parser inserts an empty scf.reduce.
} {
mapping = [
#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
]
}
return
}
}

// CHECK-LABEL: func.func @two_dim_parallel_mapped
// CHECK: gpu.launch
// CHECK: affine.apply
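
To make the goal comment above concrete, the launch body this test expects contains one such index computation per mapped dimension: one fed by the block id and one by the thread id. A rough sketch of that shape, where the SSA names and the way the grid/block sizes are materialized are assumptions rather than the verbatim output of the conversion:

func.func @sketch_of_expected_body() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %c32, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %c32, %sy = %c1, %sz = %c1) {
    // newIndex = hardware_id * step + lowerBound for each mapped dimension.
    %i = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%bx)[%c1, %c0]
    %j = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%tx)[%c1, %c0]
    gpu.terminator
  }
  return
}

Checking only for gpu.launch and affine.apply keeps the test focused on the "no crash" goal stated above.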
24 changes: 24 additions & 0 deletions mlir/test/Conversion/SCFToGPU/parallel-to-gpu-index-creation.mlir
@@ -0,0 +1,24 @@
// RUN: mlir-opt %s --convert-parallel-loops-to-gpu | FileCheck %s

module {
func.func @one_dim_parallel_mapped() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

    // 1-D loop mapped to thread_x; use the IV to force index computation.
scf.parallel (%t) = (%c0) to (%c64) step (%c1) {
%w = arith.addi %t, %c0 : index
// Implicit empty scf.reduce terminator.
} {
mapping = [
#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>
]
}
return
}
}

// CHECK-LABEL: func.func @one_dim_parallel_mapped
// CHECK: gpu.launch
// CHECK: affine.apply