Skip to content

[mlir][gpu] Add subgroup_broadcast op #152808

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -1517,7 +1517,7 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
/// Sets the targets of the module.
void setTargets(ArrayRef<TargetAttrInterface> targets);
}];

let hasVerifier = 1;
}

Expand Down Expand Up @@ -3212,4 +3212,52 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
}];
}

// Enumerates which lane a `gpu.subgroup_broadcast` reads its value from.
// See GPU_SubgroupBroadcastOp's description for the exact semantics of each
// case.
def GPU_BroadcastType : I32EnumAttr<"BroadcastType",
    "a lane to broadcast from",
    [
      I32EnumAttrCase<"first_active_lane", 0>,
      I32EnumAttrCase<"any_lane", 1>,
      I32EnumAttrCase<"specific_lane", 2>
    ]>{
  // Use the generic EnumAttr wrapper below instead of a specialized attribute.
  let genSpecializedAttr = 0;
  let cppNamespace = "::mlir::gpu";
}
// Attribute wrapper exposing the enum as an op attribute with the
// `broadcast` mnemonic.
def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast">;

// Broadcast of a value across the lanes of a subgroup. Speculatability and
// integer-range inference are implemented in GPUDialect.cpp.
def GPU_SubgroupBroadcastOp : GPU_Op<"subgroup_broadcast",
    [NoMemoryEffect, AllTypesMatch<["result", "src"]>,
     DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
     DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] #
     ElementwiseMappable.traits>,
    // `lane` is only valid for the `specific_lane` broadcast type; the
    // verifier enforces its presence/absence.
    Arguments<(ins AnyType:$src,
                   Optional<I32>:$lane,
                   GPU_BroadcastTypeAttr:$broadcast_type)> {
  let summary = "Broadcasts a value from the specific lane across subgroup";
  let description = [{
    Broadcasts a value from one lane to all active lanes in a subgroup. The
    result is guaranteed to be uniform across the active lanes in subgroup.

    The possible broadcast types are:

    * `first_active_lane` - broadcasts the value from the first active lane
    in the subgroup.
    * `specific_lane` - broadcasts from the specified lane. The lane index
    must be uniform and within the subgroup size. The result is poison if the
    lane index is invalid, non subgroup-uniform, or if the source lane is not
    active.
    * `any_lane` - broadcasts the value from any lane of the subgroup,
    assuming the input is already subgroup uniform. The result is poison if
    the input is not uniform. This is useful to convey uniformity to the
    compiler to enable more optimizations. Also, it allows more speculation
    opportunities than `first_active_lane` since `first_active_lane` results
    can depend on active lanes which may change during speculation across
    control flow.
  }];
  let results = (outs AnyType:$result);
  let assemblyFormat = [{
    $src `,` $broadcast_type ($lane^)? attr-dict `:` type($result)
  }];
  let hasVerifier = 1;
}

#endif // GPU_OPS
24 changes: 23 additions & 1 deletion mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,27 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
const amdgpu::Chipset chipset;
};

/// Lowers `gpu.subgroup_broadcast` to the ROCDL readlane/readfirstlane
/// intrinsics. (This block was reconstructed: the scraped source had web-page
/// review-comment text interleaved into the function body.)
struct GPUSubgroupBroadcastOpToROCDL
    : public ConvertOpToLLVMPattern<gpu::SubgroupBroadcastOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::SubgroupBroadcastOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Value src = adaptor.getSrc();
    if (adaptor.getBroadcastType() == gpu::BroadcastType::specific_lane) {
      // Read from an explicitly supplied, subgroup-uniform lane index.
      rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src,
                                                     adaptor.getLane());
    } else { // first_active_lane or any_lane
      // any_lane is lowered to readfirstlane too, to force value into scalar
      // register.
      rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(),
                                                          src);
    }
    return success();
  }
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

Expand Down Expand Up @@ -453,7 +474,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
// TODO: Add alignment for workgroup memory
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL,
GPUSubgroupBroadcastOpToROCDL>(converter);
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

populateMathToROCDLConversionPatterns(converter, patterns);
Expand Down
39 changes: 39 additions & 0 deletions mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2511,6 +2511,45 @@ bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) {
verifyDistributedType(lhs, rhs, getWarpSize(), getOperation()));
}

//===----------------------------------------------------------------------===//
// GPU_SubgroupBroadcastOp
//===----------------------------------------------------------------------===//

/// Broadcasting does not alter the value, only its uniformity, so the result's
/// integer range is exactly the source operand's range.
void gpu::SubgroupBroadcastOp::inferResultRanges(
    ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) {
  setResultRange(getResult(), argRanges.front());
}

/// Reports whether this broadcast may be hoisted/speculated across control
/// flow (queried e.g. by loop-invariant code motion).
Speculation::Speculatability gpu::SubgroupBroadcastOp::getSpeculatability() {
  switch (getBroadcastType()) {
  case BroadcastType::first_active_lane:
    // Cannot speculate first_lane broadcast, because speculating it across
    // control flow can change the active lanes.
    return Speculation::NotSpeculatable;
  case BroadcastType::any_lane:
  case BroadcastType::specific_lane:
    // Result does not depend on which lanes happen to be active: `any_lane`
    // assumes a uniform input and `specific_lane` reads a fixed lane.
    return Speculation::Speculatable;
  }
  // Fully-covered switch above; keep -Wreturn-type/MSVC C4715 quiet and trap
  // corrupt enum values in debug builds.
  llvm_unreachable("invalid broadcast type");
}

/// Verifies the lane operand matches the broadcast type: `specific_lane`
/// requires it, all other types forbid it.
LogicalResult gpu::SubgroupBroadcastOp::verify() {
  switch (getBroadcastType()) {
  case BroadcastType::first_active_lane:
  case BroadcastType::any_lane:
    // These modes pick the source lane implicitly.
    if (getLane())
      return emitOpError()
             << "lane can only be specified for `specific_lane` broadcast";
    return success();
  case BroadcastType::specific_lane:
    if (!getLane())
      return emitOpError()
             << "lane must be specified for `specific_lane` broadcast";
    return success();
  }
  // Fully-covered switch above; keep -Wreturn-type/MSVC C4715 quiet and trap
  // corrupt enum values in debug builds.
  llvm_unreachable("invalid broadcast type");
}

//===----------------------------------------------------------------------===//
// GPU KernelMetadataAttr
//===----------------------------------------------------------------------===//
Expand Down
18 changes: 17 additions & 1 deletion mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ gpu.module @test_module {
// CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
// CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32
// CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
%shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
%shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
// *** UP mode shuffle ***
// CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
// CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32
Expand Down Expand Up @@ -776,3 +776,19 @@ gpu.module @test_module {
func.return %bDimX : index
}
}

// -----

gpu.module @test_module {
  // Lowering of gpu.subgroup_broadcast: first_active_lane and any_lane both
  // lower to rocdl.readfirstlane; specific_lane lowers to rocdl.readlane with
  // the lane operand. index is converted to i64 by the type converter.
  // CHECK-LABEL: func @broadcast
  //  CHECK-SAME: (%[[ARG:.*]]: i64, %[[IDX:.*]]: i32)
  func.func @broadcast(%arg0 : index, %arg1 : i32) -> (index, index, index) {
    // CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
    // CHECK: %{{.*}} = rocdl.readfirstlane %[[ARG]] : i64
    // CHECK: %{{.*}} = rocdl.readlane %[[ARG]], %[[IDX]] : (i64, i32) -> i64
    %0 = gpu.subgroup_broadcast %arg0, first_active_lane : index
    %1 = gpu.subgroup_broadcast %arg0, any_lane : index
    %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : index
    func.return %0, %1, %2 : index, index, index
  }
}
24 changes: 24 additions & 0 deletions mlir/test/Dialect/GPU/broadcast-speculatability.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s

// Exercises SubgroupBroadcastOp::getSpeculatability via LICM.
func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32)

// CHECK-LABEL: func @broadcast_hoisting
//  CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32, {{.*}}: index)
func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32, %arg2 : index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // `any_lane` and `specific_lane` can be speculated across the control flow, but
  // `first_active_lane` cannot as active lanes can change.
  // CHECK: %[[V1:.*]] = gpu.subgroup_broadcast %[[ARG]], any_lane : f32
  // CHECK: %[[V2:.*]] = gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
  // CHECK: scf.for
  // CHECK: %[[V0:.*]] = gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
  // CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]])
  scf.for %i = %c0 to %arg2 step %c1 {
    %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
    %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
    %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
    func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> ()
  }
  func.return
}
19 changes: 19 additions & 0 deletions mlir/test/Dialect/GPU/int-range-interface.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -329,3 +329,22 @@ module attributes {gpu.container_module} {
}
}
}

// -----

// SubgroupBroadcastOp::inferResultRanges forwards the source range unchanged,
// so all three broadcast types should reflect the same [0, 10] bounds.
// CHECK-LABEL: func @broadcast
func.func @broadcast(%idx: i32) {
  %0 = test.with_bounds { umin = 0 : index, umax = 10 : index, smin = 0 : index, smax = 10 : index } : index
  %1 = gpu.subgroup_broadcast %0, first_active_lane : index
  %2 = gpu.subgroup_broadcast %0, any_lane : index
  %3 = gpu.subgroup_broadcast %0, specific_lane %idx : index

  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}
  // CHECK: test.reflect_bounds {smax = 10 : index, smin = 0 : index, umax = 10 : index, umin = 0 : index}

  %4 = test.reflect_bounds %1 : index
  %5 = test.reflect_bounds %2 : index
  %6 = test.reflect_bounds %3 : index
  return
}
16 changes: 14 additions & 2 deletions mlir/test/Dialect/GPU/ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ module attributes {gpu.container_module} {
// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
// CHECK-NEXT: gpu.yield %{{.*}} : f32
// CHECK-NEXT: } : (f32) -> f32
%sum2 = gpu.all_reduce %one {
%sum2 = gpu.all_reduce %one {
^bb(%lhs : f32, %rhs : f32):
%tmp = arith.addf %lhs, %rhs : f32
gpu.yield %tmp : f32
Expand Down Expand Up @@ -259,7 +259,7 @@ module attributes {gpu.container_module} {
%1 = arith.cmpi slt, %arg0, %arg0 : i32
scf.if %1 {
gpu.printf ", "
}
}
gpu.return
}

Expand Down Expand Up @@ -542,3 +542,15 @@ func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4
}
return %2 : vector<4xi32>
}

// Round-trip test for the gpu.subgroup_broadcast custom assembly format:
// the optional lane operand prints only for `specific_lane`.
// CHECK-LABEL: func @subgroup_broadcast
//  CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32)
func.func @subgroup_broadcast(%arg0 : f32, %arg1 : i32) -> (f32, f32, f32) {
  // CHECK: gpu.subgroup_broadcast %[[ARG]], first_active_lane : f32
  %0 = gpu.subgroup_broadcast %arg0, first_active_lane : f32
  // CHECK: gpu.subgroup_broadcast %[[ARG]], any_lane : f32
  %1 = gpu.subgroup_broadcast %arg0, any_lane : f32
  // CHECK: gpu.subgroup_broadcast %[[ARG]], specific_lane %[[IDX]] : f32
  %2 = gpu.subgroup_broadcast %arg0, specific_lane %arg1 : f32
  func.return %0, %1, %2 : f32, f32, f32
}