
Commit 05ed170

[mlir][gpu] Add broadcast_lane op
`broadcast_lane` broadcasts a value from one lane to all lanes in the subgroup. Supported modes:

* `first_lane` - broadcast the value from the first active lane in the subgroup.
* `lane` - broadcast the value from the specified lane; the lane index must be within the subgroup.
* `any_lane` - if the `src` value is uniform across all subgroup lanes, return it unchanged; otherwise the result is poison. This variant is essentially a uniformity hint for the compiler, conveying that a specific value is uniform across all subgroup lanes. Dropping an `any_lane` broadcast does not change the code semantics.
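For reference, a minimal sketch of the op's textual form (it mirrors the round-trip test added to ops.mlir in this commit; `%src` and `%lane_idx` are placeholder names):

```mlir
// Broadcast from the first active lane of the subgroup.
%0 = gpu.broadcast_lane %src, first_lane : f32
// Uniformity hint: %src is expected to be uniform; otherwise the result is poison.
%1 = gpu.broadcast_lane %src, any_lane : f32
// Broadcast from an explicitly given lane index (i32 operand).
%2 = gpu.broadcast_lane %src, lane %lane_idx : f32
```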
1 parent cdf30f0 commit 05ed170

File tree: 7 files changed, +176 −4 lines

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 43 additions & 1 deletion
@@ -1517,7 +1517,7 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
     /// Sets the targets of the module.
     void setTargets(ArrayRef<TargetAttrInterface> targets);
   }];
-
+
   let hasVerifier = 1;
 }
 
@@ -3215,4 +3215,46 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
   }];
 }
 
+def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
+    "a lane to broadcast from",
+    [
+      I32EnumAttrCase<"first_lane", 0>,
+      I32EnumAttrCase<"any_lane", 1>,
+      I32EnumAttrCase<"lane", 2>
+    ]>{
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::gpu";
+}
+def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast">;
+
+def GPU_BroadcastLaneOp : GPU_Op<"broadcast_lane",
+    [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
+     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
+     DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
+     ElementwiseMappable.traits>,
+    Arguments<(ins AnyType:$src,
+                   Optional<I32>:$lane,
+                   GPU_BroadcastTypeAttr:$broadcast_type)> {
+  let summary = "Broadcasts a value from a specific lane across the subgroup";
+  let description = [{
+    Broadcasts a value from one lane to all lanes in the subgroup.
+
+    The possible broadcast types are:
+
+    * `first_lane` - broadcast from the first active lane in the subgroup.
+    * `lane` - broadcast from the specified lane; the lane index must be
+      within the subgroup.
+    * `any_lane` - if the `src` value is uniform across all subgroup lanes,
+      return it unchanged; otherwise the result is poison. This variant is
+      essentially a uniformity hint for the compiler, conveying that the
+      value is uniform across all subgroup lanes. Dropping an `any_lane`
+      broadcast does not change the code semantics.
+  }];
+  let results = (outs AnyType:$result);
+  let assemblyFormat = [{
+    $src `,` $broadcast_type ($lane^)? attr-dict `:` type($result)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // GPU_OPS

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Lines changed: 24 additions & 1 deletion
@@ -171,6 +171,27 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
   const amdgpu::Chipset chipset;
 };
 
+struct GPUBroadcastLaneOpToROCDL
+    : public ConvertOpToLLVMPattern<gpu::BroadcastLaneOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::BroadcastLaneOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value src = adaptor.getSrc();
+    if (adaptor.getBroadcastType() == gpu::BroadcastType::lane) {
+      rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
+                                                     adaptor.getLane());
+    } else { // first_lane or any_lane
+      // any_lane is lowered to readfirstlane too, to force the value into a
+      // scalar register.
+      rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
+                                                          src);
+    }
+    return success();
+  }
+};
+
 struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
 
@@ -463,7 +484,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
   // TODO: Add alignment for workgroup memory
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
-  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+  patterns
+      .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUBroadcastLaneOpToROCDL>(
+          converter);
   patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);
 
   populateMathToROCDLConversionPatterns(converter, patterns);
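Summarizing the pattern above: `lane` broadcasts lower to `rocdl.readlane`, while `first_lane` and `any_lane` both lower to `rocdl.readfirstlane` (for `any_lane` this also forces the value into a scalar register). A rough sketch of the emitted ops, with illustrative value names; the gpu-to-rocdl.mlir test below checks the exact output:

```mlir
%r_lane  = rocdl.readlane %src, %idx : (i64, i32) -> i64   // broadcast_type = lane
%r_first = rocdl.readfirstlane %src : i64                  // broadcast_type = first_lane
%r_any   = rocdl.readfirstlane %src : i64                  // broadcast_type = any_lane
```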

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 37 additions & 0 deletions
@@ -2514,6 +2514,43 @@ gpu::YieldOp WarpExecuteOnLane0Op::getTerminator() {
   return cast<gpu::YieldOp>(getBody()->getTerminator());
 }
 
+//===----------------------------------------------------------------------===//
+// GPU_BroadcastLaneOp
+//===----------------------------------------------------------------------===//
+
+void gpu::BroadcastLaneOp::inferResultRanges(
+    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
+  setResultRange(getResult(), argRanges.front());
+}
+
+Speculation::Speculatability gpu::BroadcastLaneOp::getSpeculatability() {
+  switch (getBroadcastType()) {
+  case BroadcastType::first_lane:
+    // Cannot speculate a first_lane broadcast, because speculating it across
+    // control flow can change the set of active lanes.
+    return Speculation::NotSpeculatable;
+  case BroadcastType::any_lane:
+    LLVM_FALLTHROUGH;
+  case BroadcastType::lane:
+    return Speculation::Speculatable;
+  }
+}
+
+LogicalResult gpu::BroadcastLaneOp::verify() {
+  switch (getBroadcastType()) {
+  case BroadcastType::first_lane:
+    LLVM_FALLTHROUGH;
+  case BroadcastType::any_lane:
+    if (getLane())
+      return emitOpError() << "lane can only be specified for lane broadcast";
+    return success();
+  case BroadcastType::lane:
+    if (!getLane())
+      return emitOpError() << "lane must be specified for lane broadcast";
+    return success();
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // GPU KernelMetadataAttr
 //===----------------------------------------------------------------------===//
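To illustrate the speculatability choice above: a `first_lane` broadcast reads from whichever lane is the first *active* one at the point where the op executes, so hoisting it across divergent control flow can change which lane is read. A hypothetical example, not taken from the patch (`%cond` is assumed lane-divergent, `test.use` is a placeholder consumer):

```mlir
scf.if %cond {
  // Only lanes where %cond is true are active here, so this reads the first
  // of those lanes. Hoisting it above the scf.if would instead read the first
  // active lane of the enclosing region, possibly a different lane and value.
  %b = gpu.broadcast_lane %v, first_lane : f32
  "test.use"(%b) : (f32) -> ()
}
```

By contrast, `lane` names its source lane explicitly and `any_lane` only asserts uniformity, so both remain speculatable; the loop-invariant-code-motion test below relies on exactly that.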

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

Lines changed: 16 additions & 0 deletions
@@ -802,3 +802,19 @@ gpu.module @test_module {
     func.return %bDimX : index
   }
 }
+
+// -----
+
+gpu.module @test_module {
+  // CHECK-LABEL: func @broadcast
+  // CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
+  func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
+    // CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+    // CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
+    // CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
+    %0 = gpu.broadcast_lane %arg0, first_lane : index
+    %1 = gpu.broadcast_lane %arg0, any_lane : index
+    %2 = gpu.broadcast_lane %arg0, lane %arg1 : index
+    func.return %0, %1, %2 : index, index, index
+  }
+}
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s
+
+func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)
+
+// CHECK-LABEL: func @broadcast_hoisting
+// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
+func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  // CHECK: %[[V1:.*]] = gpu.broadcast_lane %[[ARG]], any_lane : f32
+  // CHECK: %[[V2:.*]] = gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32
+  // CHECK: scf.for
+  // CHECK: %[[V0:.*]] = gpu.broadcast_lane %[[ARG]], first_lane : f32
+  // CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
+  scf.for %i = %c0 to %c10 step %c1 {
+    %0 = gpu.broadcast_lane %arg0, first_lane : f32
+    %1 = gpu.broadcast_lane %arg0, any_lane : f32
+    %2 = gpu.broadcast_lane %arg0, lane %arg1 : f32
+    func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
+  }
+  func.return
+}

mlir/test/Dialect/GPU/int-range-interface.mlir

Lines changed: 19 additions & 0 deletions
@@ -329,3 +329,22 @@ module attributes {gpu.container_module} {
     }
   }
 }
+
+// -----
+
+// CHECK-LABEL: func @broadcast
+func.func @broadcast(%idx: i32) {
+  %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
+  %1 = gpu.broadcast_lane %0, first_lane : index
+  %2 = gpu.broadcast_lane %0, any_lane : index
+  %3 = gpu.broadcast_lane %0, lane %idx : index
+
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
+
+  %4 = test.reflect_bounds %1 : index
+  %5 = test.reflect_bounds %2 : index
+  %6 = test.reflect_bounds %3 : index
+  return
+}

mlir/test/Dialect/GPU/ops.mlir

Lines changed: 14 additions & 2 deletions
@@ -126,7 +126,7 @@ module attributes {gpu.container_module} {
   // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
   // CHECK-NEXT: gpu.yield %{{.*}} : f32
   // CHECK-NEXT: } : (f32) -> f32
-  %sum2 = gpu.all_reduce %one {
+  %sum2 = gpu.all_reduce %one {
   ^bb(%lhs : f32, %rhs : f32):
     %tmp = arith.addf %lhs, %rhs : f32
     gpu.yield %tmp : f32
@@ -259,7 +259,7 @@ module attributes {gpu.container_module} {
   %1 = arith.cmpi slt, %arg0, %arg0 : i32
   scf.if %1 {
     gpu.printf ", "
-  }
+  }
   gpu.return
 }
 
@@ -542,3 +542,15 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
   }
   return %2 : vector<4xi32>
 }
+
+// CHECK-LABEL: func @broadcast_lane
+// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
+func.func @broadcast_lane(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
+  // CHECK: gpu.broadcast_lane %[[ARG]], first_lane : f32
+  %0 = gpu.broadcast_lane %arg0, first_lane : f32
+  // CHECK: gpu.broadcast_lane %[[ARG]], any_lane : f32
+  %1 = gpu.broadcast_lane %arg0, any_lane : f32
+  // CHECK: gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32
+  %2 = gpu.broadcast_lane %arg0, lane %arg1 : f32
+  func.return %0, %1, %2 : f32, f32, f32
+}
