Teach PropagateDispatchSizeBounds about gpu.lane_id (#20456)

lpy · web-flow · commit 6d25b919f985 · 2025-04-07T17:12:47.000-05:00
If a subgroup size is available (the export op's subgroup_size when set, otherwise the value returned by getGPUSubgroupSize), set it as the upper bound of gpu.lane_id. Fixes: #20385
diff --git a/compiler/src/iree/compiler/Codegen/Common/PropagateDispatchSizeBounds.cpp b/compiler/src/iree/compiler/Codegen/Common/PropagateDispatchSizeBounds.cpp
@@ -64,10 +64,16 @@ static void foldConstantBounds(
 
 static void applyBounds(FunctionOpInterface funcOp,
                         ArrayRef<std::optional<int64_t>> workgroupSizes,
-                        ArrayRef<std::optional<int64_t>> workgroupCounts) {
+                        ArrayRef<std::optional<int64_t>> workgroupCounts,
+                        std::optional<uint64_t> subgroupSize) {
   Builder b(funcOp->getContext());
   funcOp->walk([&](Operation *op) {
     TypeSwitch<Operation *>(op)
+        .Case([&](gpu::LaneIdOp laneIdOp) {
+          if (subgroupSize) {
+            laneIdOp.setUpperBoundAttr(b.getIndexAttr(*subgroupSize));
+          }
+        })
         .Case([&](gpu::ThreadIdOp tidOp) {
           std::optional<int64_t> bound =
               workgroupSizes[static_cast<uint32_t>(tidOp.getDimension())];
@@ -132,6 +138,8 @@ struct PropagateDispatchSizeBoundsPass final
     std::optional<SmallVector<int64_t>> staticWorkgroupSize =
         getWorkgroupSize(funcOp);
 
+    std::optional<uint64_t> subgroupSize = getGPUSubgroupSize(funcOp);
+
     // Late in codegen, we've reconciled the workgroup size onto the export op.
     if (std::optional<IREE::HAL::ExecutableExportOp> exportOp =
             getEntryPoint(funcOp)) {
@@ -141,6 +149,11 @@ struct PropagateDispatchSizeBoundsPass final
             llvm::map_to_vector(exportWorkgroupSize->getAsRange<IntegerAttr>(),
                                 [](IntegerAttr a) { return a.getInt(); });
       }
+
+      if (std::optional<uint64_t> exportSubgroupSize =
+              exportOp->getSubgroupSizeAsUInt()) {
+        subgroupSize = exportSubgroupSize;
+      }
     }
 
     if (staticWorkgroupSize) {
@@ -162,7 +175,7 @@ struct PropagateDispatchSizeBoundsPass final
     }
 
     foldConstantBounds(funcOp, staticWorkgroupSize, staticWorkgroupCounts);
-    applyBounds(funcOp, workgroupSizes, workgroupCounts);
+    applyBounds(funcOp, workgroupSizes, workgroupCounts, subgroupSize);
   }
 };
 } // namespace
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/propagate_dispatch_size_bounds.mlir b/compiler/src/iree/compiler/Codegen/Common/test/propagate_dispatch_size_bounds.mlir
@@ -27,6 +27,9 @@ hal.executable private @static {
     builtin.module {
 // CHECK-LABEL: func.func @static()
       func.func @static() {
+// CHECK-NEXT: gpu.lane_id upper_bound 32
+        %lane_id = gpu.lane_id
+
 // CHECK-NEXT: gpu.thread_id x upper_bound 64
 // CHECK-NEXT: gpu.thread_id y upper_bound 2
 // CHECK-NEXT: gpu.thread_id z upper_bound 1
@@ -70,6 +73,42 @@ hal.executable private @static {
 
 // -----
 
+// Note: this is not the real target definition; it is missing types.
+#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "",
+  wgp = <compute =  fp32,
+    storage =  b32,
+    subgroup =  arithmetic,
+    dot =  none, mma = [],
+    subgroup_size_choices = [32, 64],
+    max_workgroup_sizes = [1024, 1024, 1024],
+    max_thread_count_per_workgroup = 1024,
+    max_workgroup_memory_bytes = 65536,
+    max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
+
+hal.executable private @manual_subgroup_size {
+  hal.executable.variant public @rocm_hsaco_fb target(#executable_target) {
+    hal.executable.export public @manual_subgroup_size ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index} {
+    ^bb0(%arg0: !hal.device):
+      %c32 = arith.constant 32 : index
+      %c8 = arith.constant 8 : index
+      %c1 = arith.constant 1 : index
+      hal.return %c32, %c8, %c1 : index, index, index
+    }
+    builtin.module {
+// CHECK-LABEL: func.func @manual_subgroup_size()
+      func.func @manual_subgroup_size() {
+// CHECK-NEXT: gpu.lane_id upper_bound 64
+        %lane_id = gpu.lane_id
+
+        return
+      }
+    }
+  }
+}
+
+// -----
+
 #executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
   {iree.gpu.target = #iree_gpu.target<arch = "gfx1100", features = "",
   wgp = <compute =  fp32,