-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[mlir][gpu] Add subgroup_broadcast
op
#152808
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1517,7 +1517,7 @@ def GPU_GPUModuleOp : GPU_Op<"module", [ | |
/// Sets the targets of the module. | ||
void setTargets(ArrayRef<TargetAttrInterface> targets); | ||
}]; | ||
|
||
let hasVerifier = 1; | ||
} | ||
|
||
|
@@ -3212,4 +3212,51 @@ def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0", | |
}]; | ||
} | ||
|
||
def GPU_BroadcastType : I32EnumAttr<"BroadcastType", | ||
"a lane to broadcast from", | ||
[ | ||
I32EnumAttrCase<"first_lane", 0>, | ||
I32EnumAttrCase<"any_lane", 1>, | ||
I32EnumAttrCase<"lane", 2> | ||
]>{ | ||
let genSpecializedAttr = 0; | ||
let cppNamespace = "::mlir::gpu"; | ||
} | ||
def GPU_BroadcastTypeAttr : EnumAttr<GPU_Dialect, GPU_BroadcastType, "broadcast">; | ||
|
||
def GPU_BroadcastLaneOp : GPU_Op<"broadcast_lane", | ||
[NoMemoryEffect, AllTypesMatch<["result", "src"]>, | ||
DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>, | ||
DeclareOpInterfaceMethods<ConditionallySpeculatable, ["getSpeculatability"]>] # | ||
ElementwiseMappable.traits>, | ||
Arguments<(ins AnyType:$src, | ||
Optional<I32>:$lane, | ||
GPU_BroadcastTypeAttr:$broadcast_type)> { | ||
let summary = "Broadcasts a value from the specific lane across subgroup"; | ||
let description = [{ | ||
Broadcasts a value from one lane to all lanes in a subgroup. The | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suggest to describe the following semantics to explicitly explain:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the description |
||
result is guaranteed to be uniform across the subgroup. | ||
|
||
The possible broadcast types are: | ||
|
||
* `first_lane` - broadcasts the value from the first active lane in the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when I first read There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, makes sense There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
subgroup. | ||
* `lane` - broadcasts from the specified lane. The lane index must be | ||
uniform and within the subgroup size. The result is poison if the lane | ||
index is invalid or non-subgroup-uniform. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the semantics if the lane being specified is an inactive lane? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Result will be undefined/poison, I will update docs, thanks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please also describe the behavior if the lane specified is out of range (like larger than the subgroup size). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the description |
||
* `any_lane` - broadcasts the value from any lane of the subgroup, | ||
active or inactive, assuming the input is already subgroup uniform. The | ||
result is poison if the input is not uniform. This is useful to convey | ||
uniformity to the compiler to enable more optimizations. Also, it allows | ||
more speculation opportunities than `first_lane` since `first_lane` | ||
results can depend on active lanes which may change during speculation | ||
across control flow. | ||
}]; | ||
let results = (outs AnyType:$result); | ||
let assemblyFormat = [{ | ||
$src `,` $broadcast_type ($lane^)? attr-dict `:` type($result) | ||
}]; | ||
let hasVerifier = 1; | ||
} | ||
|
||
#endif // GPU_OPS |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -160,6 +160,27 @@ struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> { | |
const amdgpu::Chipset chipset; | ||
}; | ||
|
||
struct GPUBroadcastLaneOpToROCDL | ||
: public ConvertOpToLLVMPattern<gpu::BroadcastLaneOp> { | ||
using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; | ||
|
||
LogicalResult | ||
matchAndRewrite(gpu::BroadcastLaneOp op, OpAdaptor adaptor, | ||
ConversionPatternRewriter &rewriter) const override { | ||
Value src = adaptor.getSrc(); | ||
if (adaptor.getBroadcastType() == gpu::BroadcastType::lane) { | ||
rewriter.replaceOpWithNewOp<ROCDL::ReadlaneOp>(op, src.getType(), src, | ||
adaptor.getLane()); | ||
} else { // first_lane or any_lane | ||
// any_lane is lowered to readfirstlane too, to force value into scalar | ||
// register. | ||
rewriter.replaceOpWithNewOp<ROCDL::ReadfirstlaneOp>(op, src.getType(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As the Could you please describe the optimization passes you plan to do for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
src); | ||
} | ||
return success(); | ||
} | ||
}; | ||
|
||
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> { | ||
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern; | ||
|
||
|
@@ -453,7 +474,9 @@ void mlir::populateGpuToROCDLConversionPatterns( | |
// TODO: Add alignment for workgroup memory | ||
patterns.add<GPUDynamicSharedMemoryOpLowering>(converter); | ||
|
||
patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter); | ||
patterns | ||
.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUBroadcastLaneOpToROCDL>( | ||
converter); | ||
patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset); | ||
|
||
populateMathToROCDLConversionPatterns(converter, patterns); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2511,6 +2511,43 @@ bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) { | |
verifyDistributedType(lhs, rhs, getWarpSize(), getOperation())); | ||
} | ||
|
||
//===----------------------------------------------------------------------===// | ||
// GPU_BroadcastLaneOp | ||
//===----------------------------------------------------------------------===// | ||
|
||
void gpu::BroadcastLaneOp::inferResultRanges( | ||
ArrayRef<ConstantIntRanges> argRanges, SetIntRangeFn setResultRange) { | ||
setResultRange(getResult(), argRanges.front()); | ||
} | ||
|
||
Speculation::Speculatability gpu::BroadcastLaneOp::getSpeculatability() { | ||
switch (getBroadcastType()) { | ||
case BroadcastType::first_lane: | ||
// Cannot speculate first_lane broadcast, because speculating it across | ||
// control flow can change the active lanes. | ||
return Speculation::NotSpeculatable; | ||
case BroadcastType::any_lane: | ||
LLVM_FALLTHROUGH; | ||
case BroadcastType::lane: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hold on, is readlane speculatable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From the past discussion #152551 (comment)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does speculatability only get used in structured control flow, though? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can speculate within a single block for sure, but practically speaking we don't have anything that speculates across blocks in MLIR AFAIK. This is only being used by LICM. We can say that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. technically, CSE can merge ops across blocks
not sure if it's a problem, though There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If your op semantics include inactive lanes this is fine |
||
return Speculation::Speculatable; | ||
} | ||
} | ||
|
||
LogicalResult gpu::BroadcastLaneOp::verify() { | ||
switch (getBroadcastType()) { | ||
case BroadcastType::first_lane: | ||
LLVM_FALLTHROUGH; | ||
case BroadcastType::any_lane: | ||
if (getLane()) | ||
return emitOpError() << "lane can only be specified for lane broadcast"; | ||
return success(); | ||
case BroadcastType::lane: | ||
if (!getLane()) | ||
return emitOpError() << "lane must be specified for lane broadcast"; | ||
return success(); | ||
} | ||
} | ||
|
||
//===----------------------------------------------------------------------===// | ||
// GPU KernelMetadataAttr | ||
//===----------------------------------------------------------------------===// | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s | ||
|
||
func.func private @side_effect(%arg0 : f32, %arg1 : f32, %arg2 : f32) | ||
|
||
// CHECK-LABEL: func @broadcast_hoisting | ||
// CHECK-SAME: (%[[ARG:.*]]: f32, %[[IDX:.*]]: i32) | ||
func.func @broadcast_hoisting(%arg0 : f32, %arg1 : i32) { | ||
%c0 = arith.constant 0 : index | ||
%c1 = arith.constant 1 : index | ||
%c10 = arith.constant 10 : index | ||
// CHECK: %[[V1:.*]] = gpu.broadcast_lane %[[ARG]], any_lane : f32 | ||
// CHECK: %[[V2:.*]] = gpu.broadcast_lane %[[ARG]], lane %[[IDX]] : f32 | ||
// CHECK: scf.for | ||
// CHECK: %[[V0:.*]] = gpu.broadcast_lane %[[ARG]], first_lane : f32 | ||
// CHECK: func.call @side_effect(%[[V0]], %[[V1]], %[[V2]]) | ||
scf.for %i = %c0 to %c10 step %c1 { | ||
%0 = gpu.broadcast_lane %arg0, first_lane : f32 | ||
%1 = gpu.broadcast_lane %arg0, any_lane : f32 | ||
%2 = gpu.broadcast_lane %arg0, lane %arg1 : f32 | ||
func.call @side_effect(%0, %1, %2) : (f32, f32, f32) -> () | ||
} | ||
func.return | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
consider broadcast_lane to subgroup_uniform_broadcast
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about just
gpu.subgroup_broadcast
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree.
uniform
is ambiguous term and requires better documentation in the op definition. The corresponding SPIRV function actually havenon_uniform
, indicating that it doesn't requires all participating lanes being active.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done