Skip to content

Commit 4880940

Browse files
authored
[mlir][gpu] Add subgroup_broadcast op (#152808)
`subgroup_broadcast` allows broadcasting a value from one lane to all lanes in a subgroup. Supported modes: * `first_active_lane` - broadcast the value from the first active lane in the subgroup. * `specific_lane` - broadcast the value from the specified lane; the lane index must be within the subgroup. * `any_lane` - if the `src` value is uniform across all the subgroup lanes, return it unchanged; otherwise the result is poison. This variant is essentially a uniformity hint for the compiler, conveying that a specific value is uniform across all subgroup lanes. Dropping an `any_lane` broadcast should not change the code semantics.
1 parent 35a3ae3 commit 4880940

File tree

7 files changed

+196
-4
lines changed

7 files changed

+196
-4
lines changed

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1517,7 +1517,7 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
15171517
/// Sets the targets of the module.
15181518
void setTargets(ArrayRef<TargetAttrInterface> targets);
15191519
}];
1520-
1520+
15211521
let hasVerifier = 1;
15221522
}
15231523

@@ -3215,4 +3215,52 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
32153215
}];
32163216
}
32173217

3218+
def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
3219+
"a lane to broadcast from",
3220+
[
3221+
I32EnumAttrCase<"first_active_lane", 0>,
3222+
I32EnumAttrCase<"any_lane", 1>,
3223+
I32EnumAttrCase<"specific_lane", 2>
3224+
]>{
3225+
let genSpecializedAttr = 0;
3226+
let cppNamespace = "::mlir::gpu";
3227+
}
3228+
def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast">;
3229+
3230+
def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
3231+
[NoMemoryEffect, AllTypesMatch<["result", "src"]>,
3232+
DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
3233+
DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
3234+
ElementwiseMappable.traits>,
3235+
Arguments<(ins AnyType:$src,
3236+
Optional<I32>:$lane,
3237+
GPU_BroadcastTypeAttr:$broadcast_type)> {
3238+
let summary = "Broadcasts a value from the specific lane across subgroup";
3239+
let description = [{
3240+
Broadcasts a value from one lane to all active lanes in a subgroup. The
3241+
result is guaranteed to be uniform across the active lanes in subgroup.
3242+
3243+
The possible broadcast types are:
3244+
3245+
* `first_active_lane` - broadcasts the value from the first active lane
3246+
in the subgroup.
3247+
* `specific_lane` - broadcasts from the specified lane. The lane index
3248+
must be uniform and within the subgroup size. The result is poison if the
3249+
lane index is invalid, non subgroup-uniform, or if the source lane is not
3250+
active.
3251+
* `any_lane` - broadcasts the value from any lane of the subgroup,
3252+
assuming the input is already subgroup uniform. The result is poison if
3253+
the input is not uniform. This is useful to convey uniformity to the
3254+
compiler to enable more optimizations. Also, it allows more speculation
3255+
opportunities than `first_active_lane` since `first_active_lane` results
3256+
can depend on active lanes which may change during speculation across
3257+
control flow.
3258+
}];
3259+
let results = (outs AnyType:$result);
3260+
let assemblyFormat = [{
3261+
$src `,` $broadcast_type ($lane^)? attr-dict `:` type($result)
3262+
}];
3263+
let hasVerifier = 1;
3264+
}
3265+
32183266
#endif // GPU_OPS

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,38 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
171171
const amdgpu::Chipset chipset;
172172
};
173173

174+
static bool isSupportedReadLaneType(Type type) {
175+
// read(first)lane also supports some vector types, but limit it for scalars
176+
// for now.
177+
return type.isInteger(16) || type.isInteger(32) || type.isInteger(64) ||
178+
isa<Float16Type, BFloat16Type, Float32Type, Float64Type,
179+
LLVM::LLVMPointerType>(type);
180+
}
181+
182+
struct GPUSubgroupBroadcastOpToROCDL
183+
: public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
184+
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
185+
186+
LogicalResult
187+
matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
188+
ConversionPatternRewriter &rewriter) const override {
189+
Value src = adaptor.getSrc();
190+
if (!isSupportedReadLaneType(src.getType()))
191+
return rewriter.notifyMatchFailure(op, "unsupported readlane type");
192+
193+
if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
194+
rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
195+
adaptor.getLane());
196+
} else { // first_active_lane or any_lane
197+
// any_lane is lowered to readfirstlane too, to force value into scalar
198+
// register.
199+
rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
200+
src);
201+
}
202+
return success();
203+
}
204+
};
205+
174206
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
175207
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
176208

@@ -463,7 +495,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
463495
// TODO: Add alignment for workgroup memory
464496
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
465497

466-
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
498+
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
499+
GPUSubgroupBroadcastOpToROCDL>(converter);
467500
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
468501

469502
populateMathToROCDLConversionPatterns(converter, patterns);

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2514,6 +2514,46 @@ gpu::YieldOp WarpExecuteOnLane0Op::getTerminator() {
25142514
return cast<gpu::YieldOp>(getBody()->getTerminator());
25152515
}
25162516

2517+
//===----------------------------------------------------------------------===//
2518+
// GPU_SubgroupBroadcastOp
2519+
//===----------------------------------------------------------------------===//
2520+
2521+
void gpu::SubgroupBroadcastOp::inferResultRanges(
2522+
ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
2523+
setResultRange(getResult(), argRanges.front());
2524+
}
2525+
2526+
Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
2527+
switch (getBroadcastType()) {
2528+
case BroadcastType::first_active_lane:
2529+
// Cannot speculate first_lane broadcast, because speculating it across
2530+
// control flow can change the active lanes.
2531+
return Speculation::NotSpeculatable;
2532+
case BroadcastType::any_lane:
2533+
LLVM_FALLTHROUGH;
2534+
case BroadcastType::specific_lane:
2535+
    // Speculation should be safe as long as we are inside structured control flow.
2536+
return Speculation::Speculatable;
2537+
}
2538+
}
2539+
2540+
LogicalResult gpu::SubgroupBroadcastOp::verify() {
2541+
switch (getBroadcastType()) {
2542+
case BroadcastType::first_active_lane:
2543+
LLVM_FALLTHROUGH;
2544+
case BroadcastType::any_lane:
2545+
if (getLane())
2546+
return emitOpError()
2547+
<< "lane can only be specified for `specific_lane` broadcast";
2548+
return success();
2549+
case BroadcastType::specific_lane:
2550+
if (!getLane())
2551+
return emitOpError()
2552+
<< "lane must be specified for `specific_lane` broadcast";
2553+
return success();
2554+
}
2555+
}
2556+
25172557
//===----------------------------------------------------------------------===//
25182558
// GPU KernelMetadataAttr
25192559
//===----------------------------------------------------------------------===//

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,3 +802,19 @@ gpu.module @test_module {
802802
func.return %bDimX : index
803803
}
804804
}
805+
806+
// -----
807+
808+
gpu.module @test_module {
809+
// CHECK-LABEL: func @broadcast
810+
// CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
811+
func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
812+
// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
813+
// CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
814+
// CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
815+
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : index
816+
%1 = gpu.subgroup_broadcast %arg0, any_lane : index
817+
%2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
818+
func.return %0, %1, %2 : index, index, index
819+
}
820+
}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
2+
3+
func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
4+
5+
// CHECK-LABEL: func @broadcast_hoisting
6+
// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
7+
func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
8+
%c0 = arith.constant 0 : index
9+
%c1 = arith.constant 1 : index
10+
// `any_lane` and `specific_lane` can be speculated across the control flow, but
11+
// `first_active_lane` cannot as active lanes can change.
12+
// CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], any_lane : f32
13+
// CHECK: %[[V2:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
14+
// CHECK: scf.for
15+
// CHECK: %[[V0:.*]] = gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
16+
// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
17+
scf.for %i = %c0 to %arg2 step %c1 {
18+
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
19+
%1 = gpu.subgroup_broadcast %arg0, any_lane : f32
20+
%2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
21+
func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
22+
}
23+
func.return
24+
}

mlir/test/Dialect/GPU/int-range-interface.mlir

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,3 +329,22 @@ module attributes {gpu.container_module} {
329329
}
330330
}
331331
}
332+
333+
// -----
334+
335+
// CHECK-LABEL: func @broadcast
336+
func.func @broadcast(%idx: i32) {
337+
%0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
338+
%1 = gpu.subgroup_broadcast %0, first_active_lane : index
339+
%2 = gpu.subgroup_broadcast %0, any_lane : index
340+
%3 = gpu.subgroup_broadcast %0, specific_lane %idx : index
341+
342+
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
343+
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
344+
// CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
345+
346+
%4 = test.reflect_bounds %1 : index
347+
%5 = test.reflect_bounds %2 : index
348+
%6 = test.reflect_bounds %3 : index
349+
return
350+
}

mlir/test/Dialect/GPU/ops.mlir

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ module attributes {gpu.container_module} {
126126
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
127127
// CHECK-NEXT: gpu.yield %{{.*}} : f32
128128
// CHECK-NEXT: } : (f32) -> f32
129-
%sum2 = gpu.all_reduce %one {
129+
%sum2 = gpu.all_reduce %one {
130130
^bb(%lhs : f32, %rhs : f32):
131131
%tmp = arith.addf %lhs, %rhs : f32
132132
gpu.yield %tmp : f32
@@ -259,7 +259,7 @@ module attributes {gpu.container_module} {
259259
%1 = arith.cmpi slt, %arg0, %arg0 : i32
260260
scf.if %1 {
261261
gpu.printf ", "
262-
}
262+
}
263263
gpu.return
264264
}
265265

@@ -542,3 +542,15 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
542542
}
543543
return %2 : vector<4xi32>
544544
}
545+
546+
// CHECK-LABEL: func @subgroup_broadcast
547+
// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
548+
func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
549+
// CHECK: gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
550+
%0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
551+
// CHECK: gpu.subgroup_broadcast %[[ARG]], any_lane : f32
552+
%1 = gpu.subgroup_broadcast %arg0, any_lane : f32
553+
// CHECK: gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
554+
%2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
555+
func.return %0, %1, %2 : f32, f32, f32
556+
}

0 commit comments

Comments
 (0)