Skip to content

[mlir][gpu] Add subgroup_broadcast op #152808

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -1517,7 +1517,7 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
/// Sets the targets of the module.
void setTargets(ArrayRef<TargetAttrInterface> targets);
}];

let hasVerifier = 1;
}

Expand Down Expand Up @@ -3212,4 +3212,52 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
}];
}

// Enumerates which lane a `gpu.subgroup_broadcast` reads its value from.
// See GPU_SubgroupBroadcastOp's description for the exact semantics of each
// case.
def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
    "a lane to broadcast from",
    [
      I32EnumAttrCase<"first_active_lane", 0>,
      I32EnumAttrCase<"any_lane", 1>,
      I32EnumAttrCase<"specific_lane", 2>
    ]>{
  // Use the generic EnumAttr wrapper below instead of a specialized attribute.
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::gpu";
}
// Attribute wrapper exposing the enum as an op attribute with the
// `broadcast` mnemonic.
def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast">;

// Broadcast of a value across the lanes of a subgroup. Speculatability and
// integer-range inference are implemented in GPUDialect.cpp.
def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
    [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
     DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
     ElementwiseMappable.traits>,
    // `lane` is only valid for the `specific_lane` broadcast type; the
    // verifier enforces its presence/absence.
    Arguments<(ins AnyType:$src,
                   Optional<I32>:$lane,
                   GPU_BroadcastTypeAttr:$broadcast_type)> {
  let summary = "Broadcasts a value from the specific lane across subgroup";
  let description = [{
    Broadcasts a value from one lane to all active lanes in a subgroup. The
    result is guaranteed to be uniform across the active lanes in subgroup.

    The possible broadcast types are:

    * `first_active_lane` - broadcasts the value from the first active lane
    in the subgroup.
    * `specific_lane` - broadcasts from the specified lane. The lane index
    must be uniform and within the subgroup size. The result is poison if the
    lane index is invalid, non subgroup-uniform, or if the source lane is not
    active.
    * `any_lane` - broadcasts the value from any lane of the subgroup,
    assuming the input is already subgroup uniform. The result is poison if
    the input is not uniform. This is useful to convey uniformity to the
    compiler to enable more optimizations. Also, it allows more speculation
    opportunities than `first_active_lane` since `first_active_lane` results
    can depend on active lanes which may change during speculation across
    control flow.
  }];
  let results = (outs AnyType:$result);
  let assemblyFormat = [{
    $src `,` $broadcast_type ($lane^)? attr-dict `:` type($result)
  }];
  let hasVerifier = 1;
}

#endif // GPU_OPS
24 changes: 23 additions & 1 deletion mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,27 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
const amdgpu::Chipset chipset;
};

/// Lowers `gpu.subgroup_broadcast` to the ROCDL readlane/readfirstlane
/// intrinsics. (This block was reconstructed: the scraped source had web-page
/// review-comment text interleaved into the function body.)
struct GPUSubgroupBroadcastOpToROCDL
    : public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Value src = adaptor.getSrc();
    if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
      // Read from an explicitly supplied, subgroup-uniform lane index.
      rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
                                                     adaptor.getLane());
    } else { // first_active_lane or any_lane
      // any_lane is lowered to readfirstlane too, to force value into scalar
      // register.
      rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
                                                          src);
    }
    return success();
  }
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

Expand Down Expand Up @@ -453,7 +474,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
GPUSubgroupBroadcastOpToROCDL>(converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

populateMathToROCDLConversionPatterns(converter, patterns);
Expand Down
39 changes: 39 additions & 0 deletions mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2511,6 +2511,45 @@ bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) {
verifyDistributedType(lhs, rhs, getWarpSize(), getOperation()));
}

//===----------------------------------------------------------------------===//
// GPU_SubgroupBroadcastOp
//===----------------------------------------------------------------------===//

/// Broadcasting does not alter the value, only its uniformity, so the result's
/// integer range is exactly the source operand's range.
void gpu::SubgroupBroadcastOp::inferResultRanges(
    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
  setResultRange(getResult(), argRanges.front());
}

/// Reports whether this broadcast may be hoisted/speculated across control
/// flow (queried e.g. by loop-invariant code motion).
Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
  switch (getBroadcastType()) {
  case BroadcastType::first_active_lane:
    // Cannot speculate first_lane broadcast, because speculating it across
    // control flow can change the active lanes.
    return Speculation::NotSpeculatable;
  case BroadcastType::any_lane:
  case BroadcastType::specific_lane:
    // Result does not depend on which lanes happen to be active: `any_lane`
    // assumes a uniform input and `specific_lane` reads a fixed lane.
    return Speculation::Speculatable;
  }
  // Fully-covered switch above; keep -Wreturn-type/MSVC C4715 quiet and trap
  // corrupt enum values in debug builds.
  llvm_unreachable("invalid broadcast type");
}

/// Verifies the lane operand matches the broadcast type: `specific_lane`
/// requires it, all other types forbid it.
LogicalResult gpu::SubgroupBroadcastOp::verify() {
  switch (getBroadcastType()) {
  case BroadcastType::first_active_lane:
  case BroadcastType::any_lane:
    // These modes pick the source lane implicitly.
    if (getLane())
      return emitOpError()
             << "lane can only be specified for `specific_lane` broadcast";
    return success();
  case BroadcastType::specific_lane:
    if (!getLane())
      return emitOpError()
             << "lane must be specified for `specific_lane` broadcast";
    return success();
  }
  // Fully-covered switch above; keep -Wreturn-type/MSVC C4715 quiet and trap
  // corrupt enum values in debug builds.
  llvm_unreachable("invalid broadcast type");
}

//===----------------------------------------------------------------------===//
// GPU KernelMetadataAttr
//===----------------------------------------------------------------------===//
Expand Down
18 changes: 17 additions & 1 deletion mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ gpu.module @test_module {
// CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
// CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
// CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
%shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
%shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
// *** UP mode shuffle ***
// CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
// CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
Expand Down Expand Up @@ -776,3 +776,19 @@ gpu.module @test_module {
func.return %bDimX : index
}
}

// -----

gpu.module @test_module {
  // Lowering of gpu.subgroup_broadcast: first_active_lane and any_lane both
  // lower to rocdl.readfirstlane; specific_lane lowers to rocdl.readlane with
  // the lane operand. index is converted to i64 by the type converter.
  // CHECK-LABEL: func @broadcast
  //  CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
  func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
    // CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
    // CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
    // CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
    %0 = gpu.subgroup_broadcast %arg0, first_active_lane : index
    %1 = gpu.subgroup_broadcast %arg0, any_lane : index
    %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
    func.return %0, %1, %2 : index, index, index
  }
}
24 changes: 24 additions & 0 deletions mlir/test/Dialect/GPU/broadcast-speculatability.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s

// Exercises SubgroupBroadcastOp::getSpeculatability via LICM.
func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)

// CHECK-LABEL: func @broadcast_hoisting
//  CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // `any_lane` and `specific_lane` can be speculated across the control flow, but
  // `first_active_lane` cannot as active lanes can change.
  // CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], any_lane : f32
  // CHECK: %[[V2:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
  // CHECK: scf.for
  // CHECK: %[[V0:.*]] = gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
  // CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
  scf.for %i = %c0 to %arg2 step %c1 {
    %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
    %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
    %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
    func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
  }
  func.return
}
19 changes: 19 additions & 0 deletions mlir/test/Dialect/GPU/int-range-interface.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -329,3 +329,22 @@ module attributes {gpu.container_module} {
}
}
}

// -----

// SubgroupBroadcastOp::inferResultRanges forwards the source range unchanged,
// so all three broadcast types should reflect the same [0, 10] bounds.
// CHECK-LABEL: func @broadcast
func.func @broadcast(%idx: i32) {
  %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
  %1 = gpu.subgroup_broadcast %0, first_active_lane : index
  %2 = gpu.subgroup_broadcast %0, any_lane : index
  %3 = gpu.subgroup_broadcast %0, specific_lane %idx : index

  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}

  %4 = test.reflect_bounds %1 : index
  %5 = test.reflect_bounds %2 : index
  %6 = test.reflect_bounds %3 : index
  return
}
16 changes: 14 additions & 2 deletions mlir/test/Dialect/GPU/ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ module attributes {gpu.container_module} {
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT: gpu.yield %{{.*}} : f32
// CHECK-NEXT: } : (f32) -> f32
%sum2 = gpu.all_reduce %one {
%sum2 = gpu.all_reduce %one {
^bb(%lhs : f32, %rhs : f32):
%tmp = arith.addf %lhs, %rhs : f32
gpu.yield %tmp : f32
Expand Down Expand Up @@ -259,7 +259,7 @@ module attributes {gpu.container_module} {
%1 = arith.cmpi slt, %arg0, %arg0 : i32
scf.if %1 {
gpu.printf ", "
}
}
gpu.return
}

Expand Down Expand Up @@ -542,3 +542,15 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
}
return %2 : vector<4xi32>
}

// Round-trip test for the gpu.subgroup_broadcast custom assembly format:
// the optional lane operand prints only for `specific_lane`.
// CHECK-LABEL: func @subgroup_broadcast
//  CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
  // CHECK: gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
  %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
  // CHECK: gpu.subgroup_broadcast %[[ARG]], any_lane : f32
  %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
  // CHECK: gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
  %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
  func.return %0, %1, %2 : f32, f32, f32
}